1 Introduction

This document contains the workflow for the analysis of SGA screens performed with cell-cycle-restricted alleles of RNH202 and a genome-wide yeast knockout library.

2 Setup

Here, several functions and R packages that are necessary for the analysis are loaded.

# Function for loading and installing packages obtained from 
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
# and adapted to current use case.
# Attaches every requested package; for packages that are not installed, asks
# for confirmation via a dialog and installs them before attaching.
# NOTE(review): winDialog() exists only on Windows — on other platforms this
#   function errors when a package is missing.
# @params ...: Package names as strings
# @return NULL (invisibly); called for its attach/install side effects
LoadPackages <- function(...){
  
  libs <- unlist(list(...))
  # require() returns FALSE (instead of erroring) for packages it cannot attach
  req <- unlist(lapply(libs, require, character.only = TRUE))
  need <- libs[!req]
  n <- length(need)
  
  if(n > 0) {
    # Build a human-readable "a, b, and c" listing of the missing packages.
    # (A stray debug print() of the partial message was removed here.)
    libsmsg <- if(n > 2) {
      paste0(paste(need[1:(n-1)], collapse = ", "), ",")
      } else { need[1] }
    
    if(n > 1) {
      libsmsg <- paste0(libsmsg, " and ", need[n])
      }
    
    libsmsg <- paste("The following packages could not be found: ", libsmsg,
                     "\n\r\n\rInstall missing packages?", collapse = "")
    
    # Ask before installing; winDialog() returns "YES"/"NO" (Windows only)
    if(winDialog(type = c("yesno"), libsmsg) == "YES") {
      install.packages(need)
      lapply(need, require, character.only = TRUE)
    }
  }
}

# Function for plotting heatmaps of colony sizes on a plate
# @params data_table: data frame containing plate number, row, column, and 
#   colony size data for plotting.
# @params plate_number: character or number indicating the plate to be plotted.
# @params fill_col: name of column containing colony size data
# @params source_type: string describing the colony size type (e.g. raw,
#   median-transformed, normalized)
# @params lowerlimit: number indicating the bottom of the plotted data
# @params midpoint: number indicating the median of the plotted data
#   (NOTE(review): currently unused by the plot — confirm whether a midpoint
#   rescale was intended)
# @params upperlimit: number indicating the ceiling of the plotted data
# @params breakpoints: number indicating the breaks for the scale
# @params na_colour: string indicating colour for NA values
# @return a ggplot object showing the selected plate as a tile grid
heatplot <- function(data_table, plate_number,
                     fill_col = "size", source_type = "Raw",
                     lowerlimit = 0, midpoint = 500, upperlimit = 1000,
                     breakpoints = 200, na_colour = "white"){
  
  ## Install tidyverse if needed, then attach it.
  # Bug fix: the original only attached the package when it had just been
  # installed, so an installed-but-unattached tidyverse was never loaded.
  if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse")
  }
  require(tidyverse)
  
  ## Plotting the data using geom_tile from ggplot2
  ggplot(data = data_table %>%
           filter(plate == plate_number),
         aes(as.factor(col), reorder(as.factor(row), -row))) +
    # The .data pronoun replaces the deprecated aes_string()
    geom_tile(aes(fill = .data[[fill_col]])) +
    scale_fill_gradientn(name = paste0("Size (", source_type, ")"),
                         na.value = na_colour,
                         colours = c("lightblue1", "cyan", "black", "yellow3", "yellow"),
                         limits = c(lowerlimit, upperlimit),
                         breaks = seq(lowerlimit, upperlimit, breakpoints)
                         ) +
    coord_fixed(0.8) +
    theme_light(base_size = 26) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    # Plates are displayed as rows 5-28 and columns 5-44 (outer border trimmed)
    scale_y_discrete(limits = as.character(rev(seq(5, 28, 1))),
                     breaks = as.character(seq(5, 28, 1))) +
    scale_x_discrete(limits = as.character((seq(5, 44, 1))),
                     breaks = as.character(seq(5, 44, 1))) +
    labs(x = "Column", y = "Row",
         subtitle = paste("Plate", plate_number))

}

# Statistic handed to the jackknife routine (taken from SGA Tools):
# standard deviation of a numeric vector with missing values ignored.
# @params x: a numeric vector
theta <- function(x) {
  sd(x, na.rm = TRUE)
}

# Function for GO Terms enrichment analysis.
# For each GO parent term found in the candidate hits, performs a
# hypergeometric test of the number of candidate ORFs annotated to the term
# against the number of background ORFs annotated to it.
# @params background: data frame containing list of all ORFs in a screen with
#   GO Term information.
# @params input: data frame containing list of candidate ORFs from a screen with
#   GO Term information.
# @params BGlist: data frame with a column containing all ORFs (has to be same
#   list of ORFs with background)
# @params Inputlist: data frame with a column containing all candidate ORFs (has
#   to be same list of ORFs with input)
# @return data frame with one row per GO parent term, holding the enrichment
#   p-value ('pval') and the unique ORF counts in the hits and the population
EnrichGOTERMS <- function(background, input, BGlist, Inputlist){
  
  #### Preparation ####
  # Install tidyverse if needed, then attach it.
  # Bug fix: the original only attached the package when it had just been
  # installed, so an installed-but-unattached tidyverse was never loaded.
  if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse")
  }
  require(tidyverse)
  
  ## List out all possible unique GO term identifiers as a list from the hits
  # GO parent term identifier is used to calculate p-values
  All_Possible_Identifiers <- unique(input$Gene.goAnnotation.ontologyTerm.parents.identifier)
  
  ## Count what is the number of unique GO term identifiers.
  unique_length <- length(All_Possible_Identifiers)
  
  ## Create a table to hold the calculated p-values from the previous hits table
  # P-value of each GO term is needed, so group by GO Terms
  newtable <- input %>%
    group_by(Gene.goAnnotation.ontologyTerm.parents.identifier,
             Gene.goAnnotation.ontologyTerm.parents.name,
             Gene.goAnnotation.ontologyTerm.parents.namespace) %>%
    # Name summary column as 'pval'; it is immediately reset to NA below
    summarise(pval = n()) %>%
    ungroup() %>%
    # Counts of number of ORFs in the hits and in the population are also needed
    mutate(pval = NA,
           Hits = NA,
           Population = NA)
  
  ## Enrichment calculation using hyper geometric distribution
  # A loop is used to go through all GO terms within the data set and perform
  #   the calculations needed.
  # 'All_Possible_Identifiers' comes from the candidate hits; each term is
  #   looked up in the background data for its population counts.
  # seq_len() is used instead of 1:unique_length so an empty candidate list
  #   skips the loop instead of iterating over c(1, 0).
  for(x in seq_len(unique_length)) {
    
    ## Print message to show which GO ID is being checked
    print(paste0(x, "/", unique_length,
                 " Analyzing GO ID: ", All_Possible_Identifiers[x],
                 " - GO Term: ", unique((background %>%
           filter(Gene.goAnnotation.ontologyTerm.parents.identifier ==
                    All_Possible_Identifiers[x])
           )$Gene.goAnnotation.ontologyTerm.parents.name)))
    
    ## Perform calculations
    # Calculate all the needed data/numbers for phyper function
    # The number of ORFs must be unique and not repeated for each GO Term
    
    # Number of unique candidate ORFs annotated to this GO term
    hits_anno <- length(unique(
      (input %>%
         filter(Gene.goAnnotation.ontologyTerm.parents.identifier ==
                  All_Possible_Identifiers[x])
       )$Gene.secondaryIdentifier))
    
    # Number of unique background ORFs annotated to this GO term
    pop_anno <- length(unique(
      (background %>%
         filter(Gene.goAnnotation.ontologyTerm.parents.identifier ==
                  All_Possible_Identifiers[x])
       )$Gene.secondaryIdentifier))
    
    hits_length <- nrow(Inputlist)
    
    pop_length <- nrow(BGlist)
    
    # P-value is calculated using phyper function as annotated on the
    # YeastMine API documentation
    pvalue <- phyper(q = hits_anno-1, m = pop_anno,
                     n = pop_length-pop_anno, k = hits_length,
                     lower.tail = FALSE)
  
    ## Store the calculated p-value and ORF counts
    newtable$pval[newtable$Gene.goAnnotation.ontologyTerm.parents.identifier ==
                    All_Possible_Identifiers[x]] <- pvalue
    
    newtable$Hits[newtable$Gene.goAnnotation.ontologyTerm.parents.identifier ==
                    All_Possible_Identifiers[x]] <- hits_anno
    
    newtable$Population[newtable$Gene.goAnnotation.ontologyTerm.parents.identifier ==
                          All_Possible_Identifiers[x]] <- pop_anno
    }
  
  return(newtable)
  
}
# Remember the directory the analysis was started from
PrimaryDirectory <- getwd()

# Loading the required R packages
LoadPackages("tidyverse", "kableExtra", "gdata", "openxlsx", "gridExtra", "grid")

# Setting the ggplot theme
theme_set(theme_light(base_size = 26))

# Row labels for a 1536 array: "A".."Z" followed by "AA".."AF" (32 rows).
# Built programmatically instead of as a hand-typed 32-element vector.
alphabet1536 <- c(LETTERS, paste0("A", LETTERS[1:6]))

# Import an Excel file containing the coordinates of the border colonies
BorderColonies <- read.xlsx("BorderColoniesList.xlsx")

2.1 SGA Tools

Additionally, SGA Tools functions from https://github.com/boonelab/sgatools/blob/master/public/SGAtools/SGAtools.R are loaded for the normalization functions used in this analysis. The functions related to normalization were modified as described below.

####################################################################################
# SGATools v1.2
# Tools for image processing, normalizing and scoring Synthetic Genetic Array screens.
#
# This software is in the public domain, furnished "as is", without technical
# support, and with no warranty, express or implied, as to its usefulness for
# any purpose.
#
# Author:         Omar Wagih
# Licence:        Academic Free Licence v3.0
# Language:       English (CA)
# Last modified:  19/11/12
# 
# Modification History 
# --------------------
# Dated  Version        Who     Description
# ----------------------------------------------------------
# 2020-05  1.1     JF    Edited normalizeSGA function as described below:
#     1)  Removed parameters 'keep.large', "intermediate.data", "linkage.file",
#         "linkage.genes". Creates issues because the parameters are not used.
#     2)  Changed "max.colony.size" param to 3*overall.plate.median
#     3)  Added new parameter "field.to.normalize" because we have different entry
#         points and the column is called differently to the default in the code
#     4)  Changed num.rows and num.cols to get the information from the global
#         environment instead of the data table.
#     5)  Removed "rdbl", "cdbl", and "spots" because of difference in the data
#         table setup
#     6)  Removed (F1) Linkage effect filter
#     7)  For (N1) plate normalization, changed 'colonysize' to parameter called
#         "field.to.normalize"
#     8)  Removed (F2) Big replicates filter
#     9)  Removed (F3) Jackknife filter. This is done later in the analysis at
#         a different point in the analysis.
#     10) Removed setting data to "ncolonysize", capping normalized colony size,
#         and adding status codes steps.
#     11) Removed all items after (N4) Plate Normalization 2, and immediately
#         return plate data after this step.
# 2020-05  1.1     JF    Edited spatialNormalization function as described below:
#     1)  Changed num.rows and num.cols to get the information from the global
#         environment instead of the data table.
#     2)  Removed line where ignore indices is set to NA:
#         ("after.ignore[ignore.ind] = NA")
# 2020-05  1.1     JF    Edited rowcolNormalization function as described below:
#     1)  Removed line where ignore indices is set to NA:
#         ("after.ignore[ignore.ind] = NA")
#     2)  Changed num.rows and num.cols to get the information from the global
#         environment instead of the data table.
# 2020-05  1.1     JF    Edited rowcolNormalizationHelper function as described below:
#     1)  Modified the section to determine window size to be used in lowess
#     2)  Simplified the lowess calculation section
# 2020-05  1.1     JF    Edited fgaussian function as below:
#     1)  Changed the seq helper function to SGAseq because of the change in
#         name of the function
# 2020-05  1.1     JF    Edited seq function as described below:
#     1)  Renamed the function to SGAseq as it was interfering with the base
#         seq() function
# 2020-10  1.2     JF    Packages loading modified
#     1)  Use LoadPackages() to load required libraries
####################################################################################

# Load required libraries for the SGA Tools code below
# "logging" provides loginfo()/addHandler() used for log output
# "stringr" is for regex matching functions
# "bootstrap" is for jackknife function
LoadPackages("logging", "stringr", "bootstrap")


# Route log records produced via loginfo() to the console
addHandler(writeToConsole)

# Strip leading whitespace from each element of a character vector
trim.leading <- function(x) sub("^\\s+", "", x)

# Strip trailing whitespace from each element of a character vector
trim.trailing <- function(x) sub("\\s+$", "", x)

# Strip both leading and trailing whitespace from each element
trim <- function(x) gsub("^\\s+|\\s+$", "", x)

# Version string reported for this edited copy of SGATools
SGATOOLS_VERSION <- '1.2'
####################################################################################
# Reading section
# Read one or more SGA colony data files into a list of annotated data frames.
# NOTE(review): file names are expected to follow the
#   username_controlflag_query_arrayplateid convention parsed by
#   fileNameMetadata() — confirm against the actual input files.
# @param file.paths: character vector of paths to colony data files
# @param file.names: names used for metadata parsing (defaults to basename)
# @param ad.paths: paths to array definition files, or NA for none
# @param replicates: replicates per strain; assumed to be a perfect square
#   (e.g. 4 = 2x2 blocks) — TODO confirm
# @return list of data frames (one per file) with columns row, col, colonysize,
#   plateid, query, array, array_annot, ncolonysize, score, kvp; the plate
#   dimensions and file-name metadata are attached as attributes
readSGA <- function(file.paths, file.names=basename(file.paths), ad.paths=NA, replicates=4){
  
  # Length of vectors
  n1 = length(file.paths)
  n2 = length(file.names)
  
  loginfo('Reading input files')#
  
  sga.data.list = lapply(file.paths, function(file.path){
    # Index of the file path
    # NOTE(review): assumes all file paths are unique; a duplicated path would
    # make which() return more than one index
    file.ind = which(file.paths == file.path)
    
    # Name of file 
    file.name = file.names[file.ind]
    
    loginfo('Reading input file (%d/%d) path = %s', file.ind, n1, file.path)
    
    # Read all lines; lines containing @, # or ( are treated as comments/metadata
    file.lines = readLines(file.path) 
    comment.meta = file.lines[grepl('@|#|[(]', file.lines)]
    file.lines = file.lines[!grepl('@|#|[(]', file.lines)]
    
    # If first line is Colony Project Data File, skip the first 13 lines 
    if(grepl('Colony Project Data File', file.lines[1], ignore.case=T)){
      loginfo('* Detected boone-lab format, skipping first 13 lines')
      file.lines = file.lines[-(1:13)]
      file.lines = sapply(file.lines, trim)
      # Collapse any run of whitespace into a single tab so read.delim can parse it
      file.lines = gsub(pattern='\\s+',replacement='\t',file.lines)
    }
    
    # Read in the data: we only care about the first 3 columns (row, col, size)
    sga.data = read.delim(textConnection(file.lines), stringsAsFactors=F, header=F)[1:3]
    names(sga.data) = c('V1', 'V2', 'V3')
    # Sort by row, then column
    sga.data = sga.data[with(sga.data, order(V1, V2)), ]
    
    loginfo('* Done reading')
    
    # Find number of rows and columns (largest observed coordinates)
    num.rows = max(sga.data[[1]])
    num.cols = max(sga.data[[2]])
    
    loginfo('* Column classes = %s', sapply(sga.data, class))
    loginfo('* Number rows = %d', num.rows)
    loginfo('* Number cols = %d', num.cols)
    loginfo('* Data dimension = %s', dim(sga.data))
    
    # Parse username/query/arrayplateid etc. out of the file name
    file.name.metadata = fileNameMetadata(file.name)
    
    loginfo('* Obtaining file name metadata: valid = %s', file.name.metadata$is.valid)
    
    # Add plate id as file name
    sga.data[[4]] = file.name.metadata$filename
    
    # Replicate-block coordinates: maps each colony back to its source
    # position, assuming sqrt(replicates) x sqrt(replicates) blocks
    rdbl = ceiling(sga.data[[1]]/sqrt(replicates))
    cdbl = ceiling(sga.data[[2]]/sqrt(replicates))
    
    # Default query - index of file
    sga.data[[5]] = as.character(file.ind)
    
    # Do we have valid meta data in the file name
    if(file.name.metadata$is.valid){
      # Update query
      sga.data[[5]] = file.name.metadata$query
    }
    
    # Default arrays as numbers - each replicate has a unique number 
    array.vals = ((cdbl - 1)* (num.rows/sqrt(replicates))) + rdbl
    
    # Map to array definition files (replaces numbers with gene names when
    # definition files are available)
    sga.data[,6:7] = mapArrayDefinition(file.name.metadata, array.vals, 
                                       rdbl, cdbl, ad.paths)
    
    # Set normalized colony size / score / KVP to NA 
    sga.data[,8:10] = NA
    
    # Add comment / meta data lines
    comment(sga.data) = comment.meta
    
    # Add other attributes
    attr(sga.data, 'file.name.metadata') = file.name.metadata
    attr(sga.data, 'num.rows') = num.rows
    attr(sga.data, 'num.cols') = num.cols
    
    # Set column names of data
    names(sga.data) = c('row', 'col', 'colonysize',
                        'plateid', 'query','array', 'array_annot',
                        'ncolonysize', 'score', 'kvp'  )
    sga.data
  })
  
  loginfo('Done reading all files')
  loginfo('-----------------------------------------------')
  return(sga.data.list)
}

# Map numeric array positions to gene names using array definition files.
# @param file.name.metadata: metadata list produced by fileNameMetadata()
# @param array.vals: default numeric array identifiers, one per colony
# @param rdbl, cdbl: replicate-block row/column index of each colony
# @param ad.paths: paths to array definition files, or NA for none
# @return character vector c(orfs, annots): ORF names followed by their
#   annotations; the caller fills these column-wise into two data frame columns
mapArrayDefinition <- function(file.name.metadata, array.vals, rdbl, cdbl, ad.paths){
  # If we have array definition files - i.e they are not all NA
  if(! all(is.na(ad.paths))){
    loginfo('* Mapping array definition: number of array definition files = %d', length(ad.paths))
    
    # Get file names of paths
    ad.basenames = basename(ad.paths)
    
    # Extract the plate tag (e.g. "plate3") from each definition file path
    ap.ids = str_extract(tolower(ad.paths), 'plate\\d+')
    # Bug fix: is.na(ap.ids) is a vector; with more than one definition file
    # the original `if (is.na(ap.ids))` errored (R >= 4.2) or silently used
    # only the first element. all() makes the intent explicit.
    if (all(is.na(ap.ids))) {
        loginfo('* Cherry picker plate name malformed... assume only one plate is available')
        ap.ids = c(file.name.metadata$arrayplateid)
    } else {
        ap.ids = as.numeric(str_extract(ap.ids, '\\d+'))
    }
    
    loginfo('* Mapping array definition: IDs of array plate files = %s', ap.ids)
    
    # Assume no valid array plate id, use the first one we have
    ind = 1
    
    if(file.name.metadata$is.valid){
      # If our file has a valid array plate id, match it to its array definition file path
      ind = which(file.name.metadata$arrayplateid == ap.ids)[1]
    }
    
    if(!is.na(ind)){
      # Read in corresponding array definition file - first 5 lines are a header
      ad.data = read.table(ad.paths[ind], sep='\t', skip=5, header=F, stringsAsFactors=F)
      names(ad.data)[1:3] = c('c', 'r', 'Gene')
      
      # For each colony, find the definition row matching its block coordinates
      m=data.frame(r=rdbl, c=cdbl)
      i = apply(m, 1, function(x){
        intersect(which(x[1] == ad.data$r), which(x[2] == ad.data$c))[1]
      })
      good.ind = !is.na(i)
      array.vals[good.ind] = ad.data$Gene[i][good.ind]
    }
    
  }else{
    loginfo('* Not mapping array definition: no array definition files')
  }
  
  ret = as.character(array.vals)
  
  # Definitions are "ORF_annotation"; split into the two parts
  orfs = unlist( lapply(strsplit(ret, '_'), function(i) i[1]) )
  annots = unlist( lapply(strsplit(ret, '_'), function(i) i[2]) )
  
  # No annotations at all: fall back to the ORF names
  if (all(is.na(annots))) {
    annots = orfs
  }
  
  return(c(orfs, annots))
}

# Get metadata encoded in file name
# Format should be: username_controlflag_query_arrayplateid.extension
# @param file.name: character file name
# @return ret: list with filename, username, query, is.control, arrayplateid
#   and is.valid (TRUE when both query and arrayplateid could be parsed)
fileNameMetadata <- function(file.name){
  # File names are split on underscores and dots
  split.pat =  '_|\\.'
  
  sp = strsplit(file.name,split.pat)[[1]]
  
  # Regular expression query of control screens
  ctrl.pat = 'wt|ctrl'
  # Regular expression for digit
  digit.pat = '^\\d+$'
  
  ret = NULL
  
  # Set file name - replace any spaces by a hyphen
  # Bug fix: sub() only replaced the first space; gsub() replaces all of
  # them, matching the documented intent
  ret$filename = gsub('\\s', '-', file.name)
  
  # Set user name - replace any spaces with a hyphen
  ret$username = gsub('\\s', '-', sp[1])
  
  # Set query name
  ret$query = gsub('\\s', '-', sp[3])
  
  # Set is.control
  ret$is.control = grepl(ctrl.pat, sp[2], ignore.case=T)
  
  # Set array plate id
  if(grepl(digit.pat, sp[4])){
    # Valid array plate id
    ret$arrayplateid = as.numeric(sp[4])
  }else{
    # Invalid array plate id
    ret$arrayplateid = NA
  }
  
  # Set is valid only if we have no NAs 
  ret$is.valid = !any(is.na(c(ret$query, ret$arrayplateid)))
  
  return(ret)
}

####################################################################################
# Normalization/scoring section

# Normalize a plate
# Runs the normalization passes kept from SGA Tools — plate, spatial,
# row/column, then plate again — adding one new column per pass
# (pnorm, snorm, rcnorm, pnorm2) to the plate data.
# NOTE(review): relies on nROW and nCOL existing in the global environment;
#   they must be set before this function is called.
# @params plate.data: SGA formatted data frame (row, col and the size column)
# @params overall.plate.median: value of overall plate median from large-scale experiments
# @params max.colony.size: computed from plate median, used to filter large colonies
# @params field.to.normalize: name of the column holding the raw colony sizes
# @return data frame: normalized data frame
# The normalizeSGA function was edited from the original
normalizeSGA <- function(
        plate.data,
        overall.plate.median = 510,
        max.colony.size = 3*overall.plate.median, # Changed this to 3 to keep everything. Threshold filter is done later.
        field.to.normalize = 'size'
        ){
  
  loginfo('Normalizing plate: overall.plate.median = %d, max.colony.size = %d, nROW = %d, nCOL = %d', 
          overall.plate.median, max.colony.size, nROW, nCOL)

  # Edited so it is filled in at the start of the function with nROW and nCOL from the global environment.
  # NOTE(review): num.rows/num.cols are not referenced below in this edited version
  num.rows = nROW
  num.cols = nCOL

  # Ignored rows (not used as we are not filtering; kept for ease of the rest of the code)
  ignore.ind = rep(FALSE, nrow(plate.data))
  names(ignore.ind) = NA
 
  ########## (N1) Plate normalization ##########
  plate.data$pnorm = plateNormalization(plate.data, field.to.normalize, overall.plate.median)

  ########## (N2) Spatial normalization ##########
  # The spatialNormalization function was edited from the original
  plate.data$snorm = spatialNormalization(plate.data, 'pnorm', ignore.ind)

  ########## (N3) Row column effect normalization ##########
  # The rowcolNormalization function was edited from the original
  plate.data$rcnorm = rowcolNormalization(plate.data, 'snorm', ignore.ind)

  ########## (N4) Plate normalization 2 ##########
  plate.data$pnorm2 = plateNormalization(plate.data, 'rcnorm', overall.plate.median)
  
  # The last section on normalizing to the overall plate median is removed.
  
  return(plate.data)
}

# Score plates
# Computes a genetic interaction score for every double-mutant (dm) plate that
# has a matching control plate with the same array plate id.
# @params plate.data.list: list of data frames (plate data), each carrying a
#   'file.name.metadata' attribute as produced by readSGA()
# @params scoring.function: 1 for Cij-CiCj, 2 for Cij/CiCj
# @return list: list of plate files, scored; returned unchanged when no
#   control or no double-mutant plates are present
scoreSGA <- function(plate.data.list, scoring.function=1){
  
  # Merge list info
  merged.dat = do.call('rbind', plate.data.list)
  
  metadata.table = lapply(plate.data.list, function(plate.data){
    d = attr(plate.data, 'file.name.metadata')
    as.data.frame(d)
  })
  # Keep the is.control / arrayplateid / is.valid metadata columns
  metadata.table = do.call('rbind', metadata.table)[,4:6]
  
  # If we dont have any control/dm plates, we cant do scoring
  if(sum(metadata.table$is.control) < 1 | sum(!metadata.table$is.control) < 1)
    return(plate.data.list)
  
  # Do scoring for different array plate ids separately 
  for(arrayplateid in unique(metadata.table$arrayplateid)){
    
    # Check if this array plate id has controls. if not, dont do anything
    is.ctrl = metadata.table$is.control & metadata.table$arrayplateid == arrayplateid
    is.dm = !metadata.table$is.control & metadata.table$arrayplateid == arrayplateid
    
    if(sum(is.ctrl) < 1 | sum(is.dm) < 1)
      next
    
    merged.dat.ctrl = do.call('rbind', plate.data.list[is.ctrl])
    merged.dat.dm = do.call('rbind', plate.data.list[is.dm])
    
    # We have at least one query, proceed to score
    # Get single mutant fitness of arrays (non control plates) - computed as median of the plate
    queries = unique(merged.dat.dm$query)
    query.smf = sapply(queries, function(curr.query){
      median( merged.dat.dm$ncolonysize[merged.dat.dm$query == curr.query] , na.rm=T)
    })
    
    # Get array smf from control plates
    arrays = unique(merged.dat.ctrl$array_annot)
    array.smf = sapply(arrays, function(curr.array){
      median( merged.dat.ctrl$ncolonysize[ merged.dat.ctrl$array_annot == curr.array ], na.rm=T )
    })
    
    # Use default overall median or median from plates?
    # Default overall median
    overall.median = 510
    # Get 60% middle median - R automatically removes NA values
    vals = sort(merged.dat$ncolonysize)
    # Renamed from 'length' to avoid shadowing base::length()
    n.vals = length(vals)
    lower = 0.2
    upper = 0.8
    middle.median = median( vals[round(lower*n.vals):round(upper*n.vals)], na.rm = T)
    
    # Do the scoring
    plate.data.list[is.dm] = lapply(plate.data.list[is.dm], function(plate.data){
      # Single mutant fitnesses
      q.smf = query.smf[plate.data$query] / middle.median
      a.smf = array.smf[plate.data$array_annot] / middle.median
      # Double mutant fitness
      dm = plate.data$ncolonysize / middle.median
      
      # Score according to scoring function
      if(scoring.function == 1){
        plate.data$score = dm - (q.smf * a.smf)
      }else if(scoring.function == 2){
        plate.data$score = dm / (q.smf * a.smf)
      }
      
      # NOTE(review): assumes the first control plate has the same row order
      # as this dm plate — confirm upstream sorting guarantees this
      plate.data$ctrlncolonysize = plate.data.list[is.ctrl][[1]]$ncolonysize
      
      plate.data
    })
    
  }# End array plate id loop
  
  # Done scoring, return the data
  return(plate.data.list)
}

# Merges names of logical vectors, collapsing using a comma
# @param a: logical vector 1 (names may be comma-separated status codes)
# @param b: logical vector 2, same length as a
# @return logical: elementwise a | b; each element's name is the union of the
#   corresponding names from a and b, with NA / "NA" entries dropped
mergeLogicalNames <- function(a, b){
  ret <- a | b
  sp1 <- strsplit( names(a), ',')
  sp2 <- strsplit( names(b), ',')
  
  # seq_along() instead of 1:length() so zero-length input iterates zero times
  new.nm <- sapply(seq_along(sp1), function(i){
    u <- as.character(union(sp1[[i]], sp2[[i]]))
    u <- u[!is.na(u) & !grepl('NA', u)]
    paste(u, collapse=',')
  })
  names(ret) <- new.nm
  return(ret)
}

#  Key-value pair methods
# Convert a character vector of key-value-pair strings to one data frame.
# @param kvps.string: character vector of kvp strings (NA entries are skipped)
# @return data frame: one column per key (uppercased), one row per kvp string;
#   an empty data frame when no parseable kvps were supplied
kvpsAsDataFrame <- function(kvps.string){
  # Parse each non-NA kvp string into a one-row matrix keyed by its keys
  row.list = lapply(kvps.string, function(kvp.string){
    if(is.na(kvp.string)) return(NULL)
    t(as.data.frame(kvpAsMap(kvp.string)))
  })
  
  # Drop the entries that were NA in the input
  row.list = Filter(Negate(is.null), row.list)
  
  # If nothing produced we return an empty data frame
  if(length(row.list) == 0) return(as.data.frame(matrix(NA,0,0)))
  
  # Outer-merge all rows so every key seen anywhere becomes a column
  merged.df = Reduce(function(...) merge(..., all=T), row.list)
  
  # Rename so everything is uppercase
  names(merged.df) = toupper(names(merged.df))
  return(merged.df)
}

# Convert key-value pair string to key-value map 
# @param kvp.string: key value pair as character, e.g. "{a=1,b=2}"
# @return list: named character vector (values, named by their keys)
kvpAsMap <- function(kvp.string){
  # Tokenize: strip braces and split on commas (plus trailing whitespace)
  tokens = strsplit(kvp.string, '\\{|\\}|,\\s*')[[1]]
  tokens = tokens[tokens != ""]
  
  # Each token is "key=value"; split it into its parts
  pairs = strsplit(tokens, '=')
  
  map = vapply(pairs, function(p) p[2], character(1))
  names(map) = vapply(pairs, function(p) p[1], character(1))
  
  return(map)
}

# Convert key-value pair map to a character string 
# @param kvp.map: key value pair as map (named vector or list)
# @return list: character representation, e.g. "a=1,b=2"
kvpMapAsString <- function(kvp.map){
  # Render every entry as "key=value", then join with commas
  rendered = vapply(names(kvp.map), function(key){
    paste0(key, '=', kvp.map[[key]])
  }, character(1))
  return(paste0(rendered, collapse=','))
}

# Linkage filter: check if query and array are within close proximity on the same chromosome
# @param plate.data: SGA formatted data frame (needs $query and $array columns)
# @param linkage.cutoff: in KB, If within this value of each other on same
#   chromosome they will be ignored; a negative value skips the filter
# @param linkage.file: path to an .Rdata file defining `chrom_coordinates`
#   (columns: gene id, chromosome, start, end)
# @param linkage.genes: extra gene ids to treat as linked, besides the query
# @return linkage.ignore: logical array with TRUE for rows to ignore. Status code as name
linkageFilter <- function(plate.data, linkage.cutoff=200, linkage.file='', linkage.genes=''){
  
  loginfo('# Applying linkage filter, linkage.cutoff = %d', linkage.cutoff)
  
  loginfo('Linkage file is at: %s', linkage.file)
  status.code = 'LK'
  
  # Load the chromosome coordinates from the supplied file
  if(file.exists(linkage.file)){
    loginfo('Loading chromosome coordinates file')
    # Bug fix: the original loaded a hard-coded 'data/chrom_coordinates.Rdata'
    # even though existence was checked on linkage.file
    load(linkage.file)
  }else{
    loginfo('Chromosome coordinates file does not exist, returning empty data frame')
    chrom_coordinates = as.data.frame(matrix(NA, 0, 4))
  }
  
  if(linkage.cutoff < 0){
    loginfo('Skipping linkage correction...')
    linked = rep(FALSE, nrow(plate.data))
    names(linked) = NA
    return(linked)
  }
  
  # Midpoint of each gene, keyed by gene id
  mid.map = apply(chrom_coordinates[,3:4], 1, mean)
  names(mid.map) = chrom_coordinates[[1]]
  
  # Chromosome of each gene, keyed by gene id
  chr.map = chrom_coordinates[[2]]
  names(chr.map) = chrom_coordinates[[1]]
  
  linkage.genes = unique(c(plate.data$query, linkage.genes))
  loginfo('# Linkage genes including query = %s', paste0(linkage.genes, collapse=', '))
  linkage.genes = linkage.genes[ linkage.genes %in% chrom_coordinates[[1]] ]
  loginfo('# Linkage genes found in coords table = %s', paste0(linkage.genes, collapse=', '))
  
  ar = plate.data$array
  
  if(length(linkage.genes) == 0){
    # Robustness fix: sapply over zero genes returns an empty list, which made
    # the apply() below error; with no linkage genes, nothing is flagged
    linked = rep(FALSE, nrow(plate.data))
  }else{
    # TRUE where the array gene sits on the same chromosome as a linkage gene
    # and its midpoint lies within the cutoff distance (cutoff is in KB)
    linked = sapply(linkage.genes, function(g){
      (chr.map[g] == chr.map[ar]) & (abs( mid.map[g] - mid.map[ar] ) < (linkage.cutoff * 1e3))
    })
    
    linked = apply(linked, 1, any)
    linked[is.na(linked)] = FALSE
  }
  
  # Set status code as the name of each flagged entry
  names(linked) = NA
  ind = which(linked)
  names(linked)[ind] = rep(status.code, length(ind))
  
  loginfo('Linkage filter applied, total ignored = %d',sum(linked))
  return(linked)
}

# Plate normalization: brings all plates to one same scale
# @param plate.data: SGA formatted data frame
# @param field.to.normalize: name of the column in the data to normalize
# @param default.overall.median: target median every plate is rescaled to
# @return: vector of normalized values (all NA when fewer than 10 colonies)
plateNormalization <- function(plate.data, field.to.normalize, default.overall.median) {
  
  loginfo('# Normalizing for plate effect, default.overall.median = %d', default.overall.median)
  
  # Quantile window: the plate median is taken over the middle 60% of colonies
  lower <- 0.2
  upper <- 0.8
  
  # Sorted colony sizes (sort() drops NA values)
  raw.values <- plate.data[[field.to.normalize]]
  sorted.values <- sort(raw.values)
  n.values <- length(sorted.values)
  
  # Too few colonies to estimate a plate median reliably
  if (n.values < 10) {
    loginfo('Insufficient data for plate normalization, returning')
    return(rep(NA, nrow(plate.data)))
  }
  
  # Median of the central 60% of the sorted colony sizes
  plate.median <- median(sorted.values[round(lower * n.values):round(upper * n.values)],
                         na.rm = TRUE)
  
  # Guard against a degenerate plate whose central colonies are all zero
  if (plate.median == 0) {
    loginfo('Median is 0, taking median of all')
    plate.median <- mean(sorted.values, na.rm = TRUE)
    loginfo(paste('New median is', plate.median))
  }
  
  # Rescale so the plate median matches the target overall median
  normalized <- raw.values * (default.overall.median / plate.median)
  
  loginfo('Done plate normalization')
  
  return(normalized)
}

# Spatial normalization: normalizes any gradient effect on the plate via median smoothing
# NOTE(review): relies on globals nROW and nCOL, and on helper functions
#   fgaussian, faverage, applyfilter and medianfilter2d defined elsewhere in
#   the SGA Tools code.
# @param plate.data: SGA formatted data frame (columns 1-2 must be row and col)
# @param field.to.normalize: name of the column in the data to normalize
# @param ignore.ind: logical for any rows to be ignored
#   (NOTE(review): accepted but no longer used in this edited version)
# @return: vector of normalized values
# The spatialNormalization function was edited from the original
spatialNormalization <- function(plate.data, field.to.normalize, ignore.ind) {

  loginfo('# Normalizing for spatial effect')

  # Edited so it is filled in at the start of the function with nROW and nCOL from the global environment.
  num.rows = nROW
  num.cols = nCOL
  
  # Get gaussian/average filters
  gaussian.filt = fgaussian(7, 2) # smaller number, better resolution between neighbouring colonies
  average.filt = faverage(9)
  
  # Data to be normalized before ignored
  before.ignore = plate.data[[field.to.normalize]]
  
  # Data to be normalized after ignored (identical here: the ignore step was removed)
  after.ignore = before.ignore
  
  # Construct plate matrix indexed by (row, col)
  plate.mat = matrix(NA, num.rows, num.cols)
  rc.mat = as.matrix(plate.data[,1:2])
  plate.mat[rc.mat] = after.ignore
  
  # Fill NA with a placeholder (mean of all colonies) 
  t = plate.mat
  ind.na = which(is.na(t))
  t[ind.na] = mean(plate.mat, na.rm = TRUE)
  
  # Fill in NA with smoothed version of neighbors using gaussian blur
  filt.g = applyfilter(t, gaussian.filt)
  t[ind.na] = filt.g[ind.na]
  
  # Apply median/average filters
  # Padding type "replicate" is to copy nearest cells and this way it makes the border colonies have more large sized colonies on an outer "fake" border to normalise it.
  filtered = medianfilter2d(t, 7, padding_type = 'replicate')
  filtered = applyfilter(filtered, average.filt, 'replicate')
  
  # Divide the smoothed surface by its mean to get a relative correction factor
  f = filtered / mean(filtered)
  
  # Divide each colony by the correction factor at its plate position
  before.ignore = before.ignore / f[rc.mat]
  
  return(before.ignore)
}

# Row column effect normalization
# Removes systematic row and column trends by smoothing colony sizes first
# across columns, then across rows.
# NOTE(review): relies on globals nROW and nCOL being set before this is called.
# @param plate.data: SGA formatted data frame (needs $row and $col columns)
# @param field.to.normalize: name of the column in the data to normalize
# @param ignore.ind: logical for any rows to be ignored 
# @return: vector of normalized values
# The rowcolNormalization function was edited from the original
rowcolNormalization <- function(plate.data, field.to.normalize, ignore.ind) {
  
  loginfo('# Normalizing for row column effect')

  # Data before rows ignored
  data.before.ignore = plate.data[[field.to.normalize]]
  
  # Ignore these rows
  # NOTE(review): nothing is actually masked at this step in the edited version
  data.after.ignore = data.before.ignore
  
  # Edited so it is filled in at the start of the function with nROW and nCOL from the global environment.
  num.rows = nROW
  num.cols = nCOL
  
  # Smooth across columns
  # The rowcolNormalizationHelper function was edited from the original
  col.data = plate.data$col
  norm.col.effect = rowcolNormalizationHelper(col.data, data.after.ignore, num.rows, num.cols)
  
  # Smooth across rows
  # Rows flagged in ignore.ind are blanked before the row pass
  # The rowcolNormalizationHelper function was edited from the original
  norm.col.effect[ignore.ind] = NA
  row.data = plate.data$row
  norm.rowcol.effect = rowcolNormalizationHelper(row.data, norm.col.effect, num.rows, num.cols)
  
  return(norm.rowcol.effect)
}

# Helper for row/column effect normalization to avoid redundancy.
# Fits a lowess trend of colony size against row (or column) index and
# divides each value by the mean-scaled fitted trend.
# @param rowcol.data: values of the row/col column in the plate data frame
# @param colony.size.data: values of the colony sizes being normalized
# @param num.rows: number of plate rows (only used by the original span
#   computation, which is commented out below)
# @param num.cols: number of plate columns (same as above)
# @return: vector of normalized values, same length/order as the input
# The rowcolNormalizationHelper function was edited from the original
rowcolNormalizationHelper <- function(rowcol.data, colony.size.data, num.rows, num.cols){
  
  # Positions with missing colony sizes are excluded from the fit and
  # handled separately at the end
  ind.na = is.na(colony.size.data)
  
  # Sort the row/col indices ascending (as lowess expects) and keep the
  # permutation so results can be written back in place
  vals.sorted = sort(rowcol.data[!ind.na], index.return = TRUE)
  ind.sorted = vals.sorted[[2]]
  vals.sorted = vals.sorted[[1]]
  
  # Original code start
  # Window size to be used in lowess smoothing - currently not using it
  #span = sum(vals.sorted <= 6) / (num.rows*num.cols)
  
  #if(span>0 & length(span) > 0){
  #lowess_smoothed = lowess(rowcol.data[!ind.na][ind.sorted], colony.size.data[!ind.na][ind.sorted], f=0.09, iter=5)
  #}else{
  #  lowess_smoothed = list(y=colony.size.data[!ind.na][ind.sorted])
  #}
  # Original code end
  
  # Modified: always fit the lowess trend with a fixed smoother span
  lowess_smoothed = lowess(rowcol.data[!ind.na][ind.sorted], colony.size.data[!ind.na][ind.sorted], f = 0.09, iter = 5)
  
  #      pdf(sprintf('~/Desktop/lowess_%s_%s.pdf', max(rowcol.data, na.rm=T), i), width=18, height=18)
  #      x = rowcol.data[!ind.na][ind.sorted]
  #      y = colony.size.data[!ind.na][ind.sorted]
  #      plot(x,y)
  #      lines(lowess_smoothed, col='green')
  #      lx = lowess_smoothed$x
  
  # We only care about Y values (the fitted trend, in sorted order)
  lowess_smoothed = lowess_smoothed[['y']]
  
  # Scale the trend to mean 1; NaN can only arise if the trend mean is 0
  tmp = lowess_smoothed / mean(lowess_smoothed)
  tmp[is.nan(tmp)] = 1
  # Divide each observed value by the trend at its position.
  # The nested replacement x[!ind.na][ind.sorted] <- ... writes back
  # through both index layers into the original vector.
  colony.size.data[!ind.na][ind.sorted] = colony.size.data[!ind.na][ind.sorted] / tmp;
  colony.size.data[is.infinite(colony.size.data)] = 0
  
  #    lines(x=lx, tmp, col='red')
  #    points(x=lx, colony.size.data[!ind.na][ind.sorted] / tmp, col='blue', pch=3)
  #    dev.off()
  
  # Build a lookup from row/col index to a smoothed trend value so NA
  # positions can be normalized too.
  # NOTE(review): duplicated() selects the *later* occurrences of each
  # index, and lowess_smoothed is aligned to the sorted non-NA data while
  # ind.uniq.rc indexes the unsorted rowcol.data -- verify this alignment
  # against the original SGA Tools implementation.
  ind.uniq.rc = which(duplicated(rowcol.data))
  rc.to.smoothed = lowess_smoothed[ind.uniq.rc]
  names(rc.to.smoothed) = rowcol.data[ind.uniq.rc]
  
  ind.na = which(ind.na)
  
  # Mean-scaled trend value for each NA position, looked up by its
  # row/col index
  na.smoothed = sapply(ind.na, function(i){
    i.rc = as.character(rowcol.data[i]) 
    
    i.smoothed = rc.to.smoothed[i.rc]
    
    i.smoothed / mean(lowess_smoothed)
  })
  na.smoothed = unlist(na.smoothed)
  
  # Normalize the NA positions with the looked-up trend values
  # (NA inputs stay NA after the division)
  colony.size.data[ind.na] = colony.size.data[ind.na] / as.vector(na.smoothed)
  
  return(colony.size.data)
}

# Jackknife (leave-one-out) filter: flags colonies that contribute more than
# 90% of the total variance among their replicate spot.
# @param plate.data: SGA formatted data frame (must contain a 'spots' column)
# @param field.to.filter: name of the column in the data to filter
# @return jk.ignore.logical: logical vector with TRUE for rows to ignore;
#   flagged entries carry the status code 'JK' as their name
jackknifeFilter <- function(plate.data, field.to.filter){
  
  loginfo('# Applying jackknife filter')
  
  # Status code attached to flagged rows
  status.code = 'JK'
  
  # All unique replicate groups (spots)
  uniq.spots = unique(plate.data$spots)
  
  # Remove the HIS3 control (YOR202W) spots from consideration.
  # Fixed: the original subset uniq.spots with a mask computed on
  # uniq.array -- the wrong vector, with silent recycling if the
  # lengths differed.
  uniq.spots = uniq.spots[ ! grepl('YOR202W', uniq.spots, ignore.case = TRUE) ]
  
  # Statistic used by the jackknife (sd, ignoring NA)
  theta <- function(x){ sd(x, na.rm = TRUE) }
  
  jk.ignore = lapply(uniq.spots, function(curr.spot){
    
    # Indices of colonies belonging to the current spot.
    # Fixed: use the exact column name 'spots' instead of relying on
    # partial matching of 'spot'.
    curr.ind = which(plate.data$spots == curr.spot)
    
    # Colony sizes of this spot's replicates
    vals = plate.data[[field.to.filter]][curr.ind]
    ind.na = is.na(vals)
    
    # Not enough non-NA replicates to estimate variance contributions.
    # Fixed: the original evaluated NULL without returning, so the
    # jackknife still ran on under-sized groups.
    if( sum(!ind.na) < 2 ) return(NULL)
    
    # Leave-one-out standard deviations
    jk.vals = jackknife(vals, theta)$jack.values
    
    # Total sum of squares and the leave-one-out equivalents
    total.var = var(vals, na.rm = TRUE) * (length(vals) - 1)
    jk.var = (jk.vals^2) * (length(jk.vals) - 2)
    
    # Colonies whose removal drops more than 90% of the total variance
    t = which( (total.var - jk.var) > (0.9 * total.var) )
    curr.ind[t]
  })
  
  # These are the indices to be ignored (dropping empty groups)
  jk.ignore.ind = unlist(jk.ignore[ ! sapply(jk.ignore, is.null) ])
  
  # Convert indices to a logical mask over all rows, e.g. indices 1, 3
  # become TRUE FALSE TRUE FALSE FALSE ...
  jk.ignore.logical = seq_len(nrow(plate.data)) %in% jk.ignore.ind
  
  # Attach status code JK to the flagged rows
  names(jk.ignore.logical) = rep(NA, length(jk.ignore.logical))
  names(jk.ignore.logical)[jk.ignore.ind] = rep(status.code, length(jk.ignore.ind))
  
  loginfo('Done jackknife filter, total ignored = %d', sum(jk.ignore.logical))
  return(jk.ignore.logical)
}

# Big replicates filter: flags replicate groups (spots) that contain at
# least three excessively large colonies.
# @param plate.data: SGA formatted data frame (must contain a 'spots' column)
# @param field.to.filter: name of the column in the data to filter
# @param max.colony.size: "large" colony threshold, usually 1.5 * plate median
# @param ignore.ind: logical for rows to be ignored (currently unused --
#   kept for interface compatibility with the other filters)
# @return big.logical: logical vector with TRUE for rows to ignore;
#   flagged entries carry the status code 'BG' as their name
bigReplicatesFilter <- function(plate.data, field.to.filter, max.colony.size, ignore.ind){
  
  # Fixed: '%d' fails in sprintf()-style formatting when max.colony.size
  # is a non-integer double (e.g. 1.5 * median); '%s' handles both cases.
  loginfo('# Applying big replicates filter, max.colony.size = %s', max.colony.size)
  
  # Status code attached to flagged rows
  status.code = 'BG'
  
  # Indices of colonies at or above the size threshold
  large.ind = which(plate.data[[field.to.filter]] >= max.colony.size)
  
  # Count of large colonies per spot
  big.spots = table(plate.data$spots[large.ind])
  
  # Keep only spots with at least 3 large colonies
  big.spots = big.spots[big.spots >= 3]
  spots.to.remove = names(big.spots)
  
  # All colonies belonging to the flagged spots
  big.ind = which(plate.data$spots %in% spots.to.remove)
  
  # Convert indices to a logical mask over all rows
  big.logical = seq_len(nrow(plate.data)) %in% big.ind
  
  # Attach status code BG to the flagged rows
  names(big.logical) = rep(NA, length(big.logical))
  names(big.logical)[big.ind] = rep(status.code, length(big.ind))
  
  loginfo('Done big replicates filter, total ignored = %d', sum(big.logical))
  return(big.logical)
}


####################################################################################
# Filter functions used in spatial normalization: rewritten from matlab for R 

# Returns a normalized gaussian filter matrix with dimensions x by x
# (equivalent to matlab's fspecial('gaussian', x, sigma)).
# @param x: dimension (number of rows/cols) of the returned gaussian filter
# @param sigma: standard deviation of the gaussian
# @return: x by x matrix of gaussian weights that sum to 1
# The fgaussian function was edited from the original
fgaussian <- function(x, sigma){
  # Symmetric coordinate grid centred on zero, e.g. 7 -> c(3,2,1,0,1,2,3)
  x = SGAseq(x)
  # Vectorized replacement for the original double loop: squared distance
  # of each (row, col) cell from the filter centre
  sq.dist = outer(x^2, x^2, `+`)
  mat = exp(-sq.dist / (2 * sigma^2))
  # Normalize so the filter weights sum to 1
  mat / sum(mat)
}


# Uniform (moving-average) filter: a size x size matrix whose entries are
# all 1/size^2, so the weights sum to 1.
# @param size: dimension (number of rows/cols) of the returned filter
# @return: size x size matrix of equal weights
faverage <- function(size){
  matrix(1 / (size * size), nrow = size, ncol = size)
}

# Helper for fgaussian: symmetric distance-from-centre sequence of length x.
# Example: x = 7 returns c(3, 2, 1, 0, 1, 2, 3); x = 4 returns
# c(1.5, 0.5, 0.5, 1.5).
# Fixed: uses seq_len() instead of 1:floor(length(x)/2) -- for x = 1 the
# original evaluated 1:0 and returned a length-3 vector instead of 0.
# @param x: number of elements in the returned vector
# @return: numeric vector of length x, increasing outward from the centre
SGAseq <- function(x){
  half = seq_len(floor(x / 2))
  if (x %% 2 == 1) {
    # Odd length: a zero sits at the exact centre
    c(rev(half), 0, half)
  } else {
    # Even length: the centre falls between two cells, so offset by 0.5
    right = half - 0.5
    c(rev(right), right)
  }
}

# Applies a linear filter to a matrix by sliding-window weighted sums
# (see matlab's imfilter).
# @param mat: matrix to which the filter is applied
# @param filter: square matrix of weights with odd dimensions > 1
# @param padding_type: border handling passed to padmatrix
#   ('zeros' or 'replicate')
# @return: filtered matrix with the same dimensions as mat
applyfilter <- function(mat, filter, padding_type = 'zeros'){
  fdim = dim(filter)
  if (fdim[1] != fdim[2])
    stop('Filter must be a square matrix')
  if (fdim[1] %% 2 == 0)
    stop('Filter dimensions must be odd')
  if (fdim[1] == 1)
    stop('Filter dimensions must be greater than one')

  # Half-width of the filter window
  halfw = (fdim[1] - 1) / 2

  nr = nrow(mat)
  nc = ncol(mat)
  out = matrix(0, nr, nc)

  # Pad the borders so the window is defined at the edges
  padded = padmatrix(mat, halfw, padding_type)

  # Weighted sum of the window centred on each cell of the padded matrix
  for (i in seq_len(nr)) {
    for (j in seq_len(nc)) {
      pi = i + halfw
      pj = j + halfw
      window = padded[(pi - halfw):(pi + halfw), (pj - halfw):(pj + halfw)]
      out[i, j] = sum(window * filter)
    }
  }
  out
}

# Applies a median filter to a matrix using a square sliding window
# (see matlab's medfilt2).
# @param mat: matrix to which the filter is applied
# @param dim: number of rows/cols of the window (must be odd and > 1).
#   Note: this parameter name shadows base::dim, so the body copies it to
#   a local and uses nrow()/ncol() instead of calling dim().
# @param padding_type: border handling passed to padmatrix
#   ('zeros' or 'replicate')
# @return: filtered matrix with the same dimensions as mat
medianfilter2d <- function(mat, dim, padding_type = 'zeros'){
  wsize = dim
  if (wsize %% 2 == 0)
    stop('Filter dimensions must be odd')
  if (wsize == 1)
    stop('Filter dimensions must be greater than one')

  # Half-width of the window
  halfw = (wsize - 1) / 2

  nr = nrow(mat)
  nc = ncol(mat)
  out = matrix(0, nr, nc)

  # Pad the borders so the window is defined at the edges
  padded = padmatrix(mat, halfw, padding_type)

  # Median of the window centred on each cell of the padded matrix
  for (i in seq_len(nr)) {
    for (j in seq_len(nc)) {
      pi = i + halfw
      pj = j + halfw
      out[i, j] = median(padded[(pi - halfw):(pi + halfw), (pj - halfw):(pj + halfw)])
    }
  }
  out
}

# Pads a matrix with lvl extra rows/columns on every side.
# @param mat: matrix to which the padding is added
# @param lvl: number of rows/columns of padding added on each side
# @param padding: 'zeros' pads with zeros; any other value replicates the
#   value of the nearest edge row/column (matlab 'replicate' behavior)
# @return: padded matrix with dimensions dim(mat) + 2 * lvl
padmatrix <- function(mat, lvl, padding){
  # Edge rows used for the top/bottom padding
  top = mat[1, ]
  bottom = mat[nrow(mat), ]
  if (padding == 'zeros') {
    top = rep(0, length(top))
    bottom = rep(0, length(bottom))
  }

  # Stack lvl copies of the edge rows above and below the matrix
  top.block = t(matrix(rep(top, lvl), length(top)))
  bottom.block = t(matrix(rep(bottom, lvl), length(bottom)))
  padded = rbind(top.block, mat, bottom.block)

  # Edge columns (taken after row padding so corners are filled too)
  left = padded[, 1]
  right = padded[, ncol(padded)]
  if (padding == 'zeros') {
    left = rep(0, length(left))
    right = rep(0, length(right))
  }

  # Attach lvl copies of the edge columns on each side
  left.block = matrix(rep(left, lvl), length(left))
  right.block = matrix(rep(right, lvl), length(right))
  cbind(left.block, padded, right.block)
}

3 Experiment Description

The screen is conducted in 1536-colony format by crossing a query plate with the yeast knockout (KO) library consisting of 85 plates as described in Baryshnikova (2010).

The query plate consists of 3 query strains and a wild type control arranged in 4x4 groups as illustrated below.

Query plate
WT G1-RNH202 WT G1-RNH202
S-RNH202 G2/M-RNH202 S-RNH202 G2/M-RNH202
WT G1-RNH202 WT G1-RNH202
S-RNH202 G2/M-RNH202 S-RNH202 G2/M-RNH202

Each strain from the KO library is replicated 16 times in a 4x4 group that is crossed with 4 technical replicates of each query and wild type. All plates contain dummy strains for the 4 outermost rows and columns of the 1536-colony format, which will not be used for analysis.

After selection of haploid double mutants, the colonies were pinned onto recovery plates, allowed to grow for 2 days and photographed.

The photographs were preprocessed in Adobe Photoshop: cropped and converted to grayscale, followed by adjustment of exposure to 0.1 and gamma correction to 0.4. The preprocessed images were then segmented to determine colony size using the ‘gitter’ package (Omar Wagih and Leopold Parts, 2015) with segmentation parameters set as below:

  1. plate.format = 1536

  2. contrast = NULL

  3. remove.noise = T.

Note:

  1. The ‘gitter’ package has been archived from CRAN but users can still install it from source using the following URL: https://cran.r-project.org/src/contrib/Archive/gitter/gitter_1.1.1.tar.gz

  2. Users will need to install R Tools (https://cran.r-project.org/bin/windows/Rtools/) before being able to install from source.

  3. Further installation notes can be found at http://omarwagih.github.io/gitter/

Examples of the raw, preprocessed, and segmented images are shown below:

Example of raw photograph

Example of raw photograph

Example of preprocessed photograph

Example of preprocessed photograph

Example of segmented gridded photograph

Example of segmented gridded photograph

4 Data Processing

The analysis begins with loading the colony size data obtained from image segmentation.

# Load the colony size data from photograph segmentation
load("20201023_photoData_Final.rda")
head(photoData_Final)
##   row col size circularity flags row1536 plate well
## 1   1   1   90      1.0559  <NA>       A    13  A01
## 2   1   2  644      0.9473  <NA>       A    13  A02
## 3   1   3   64      1.0591  <NA>       A    13  A03
## 4   1   4  477      0.9661  <NA>       A    13  A04
## 5   1   5   36      1.1424  <NA>       A    13  A05
## 6   1   6  473      0.9439  <NA>       A    13  A06

4.1 Remove Empty Spots and Border Colonies

First, we look at the distribution of the colony sizes of the complete dataset.

binvalue <- 250

ggplot(data = photoData_Final) + 
  geom_histogram(aes(size), bins = binvalue) +
  scale_x_continuous(breaks = seq(-100, 3500, by = 100)) +
  labs(x = "Colony Size (Raw)",
       title = "Raw colony size (complete dataset)")

The colony sizes which are 0 are changed to NA as they are already known to be empty positions. Then, border colonies are marked, and the colony sizes are also set to NA.

# Modify the border colonies table: add a marker column so the join below
# labels border wells
BorderColonies2 <- BorderColonies %>%
  mutate(IsBorder = TRUE)

# Combining the data table with the border colonies list to mark border and
# non-border colonies (border wells are the same positions on every plate,
# so joining on "well" alone is intentional)
photoData_FinalwoBorder <- left_join(photoData_Final, BorderColonies2,
                                     by = "well")
# Wells absent from the border list come back NA from the join; treat them
# as non-border
photoData_FinalwoBorder$IsBorder[is.na(photoData_FinalwoBorder$IsBorder)] <- FALSE

# Marking all empty positions (size 0 means nothing grew at that spot).
# This must run BEFORE the sizes are set to NA below, while size is still 0.
photoData_FinalwoBorder$IsEmpty[photoData_FinalwoBorder$size == 0] <- TRUE
photoData_FinalwoBorder$IsEmpty[photoData_FinalwoBorder$size != 0] <- FALSE

# Set the empty positions and border colonies size as NA
photoData_FinalwoBorder$size[photoData_FinalwoBorder$size == 0] <- NA
photoData_FinalwoBorder$size[photoData_FinalwoBorder$IsBorder == TRUE] <- NA

The raw colony sizes are shown again below with the empty positions and border colonies removed.

ggplot(data = photoData_FinalwoBorder) + 
  geom_histogram(aes(size), bins = binvalue) +
  scale_x_continuous(breaks = seq(-100, 3500, by = 100)) +
  labs(x = "Colony Size (Raw)",
       title = "Raw colony size",
       subtitle = "Empty positions and border colonies are removed.")

Here, we inspect the data by query and plate. For this purpose, the raw data table is joined to the annotation data for ease of grouping or filtering. The annotation data for the screen was created beforehand and loaded.

# Load the annotation data
load("20201023_KOanno.rda")

# Build the 1536-format well ID (row letter + zero-padded column number) and
# a plate column so the annotation can be joined to the photo data tables
KOanno$well <- paste0(KOanno$row_1536, sprintf("%02d", KOanno$col_1536))
KOanno$plate <- KOanno$plate_1536
# NOTE(review): as.tbl() is deprecated in recent dplyr in favour of
# as_tibble() -- confirm the installed dplyr version before changing.
KOanno <- as.tbl(KOanno)

First, we look at the wild type and each query strain separately.

# Join photoData table with the annotation table.
# Then, filter off positions that do not have an ORF in the KO library.
df_temp_FinalwoBorder_Anno <- left_join(photoData_FinalwoBorder, KOanno,
                                        by = c("plate", "well")) %>%
  filter(!is.na(ORF))

# Histogram
ggplot(data = df_temp_FinalwoBorder_Anno) +
  geom_histogram(aes(size), bins = 100) +
  facet_wrap(aes(Query)) +
  labs(x = "Colony Size (Raw)",
       y = "Count",
       title = "Raw colony size by query",
       subtitle = "Empty positions and border colonies are removed.")

The average colony size for the 3 queries appears to be smaller than that of the wild-type control. This trend is confirmed by comparing colony sizes on each plate.

ggplot(data = (df_temp_FinalwoBorder_Anno %>%
                           select(plate, well, size, circularity, ORF, Query))) +
    geom_boxplot(aes(Query, size)) +
    facet_wrap(aes(plate)) +
    theme(axis.text.x = element_text(angle = -45)) +
    labs(x = "Query",
         y = "Colony Size (Raw)",
         title = "Raw colony size by plate",
         subtitle = "Empty positions and border colonies are removed.")

This analysis also shows that the average colony size varies from plate to plate.

Lastly, we examine the data for spatial effects. The colony sizes for several plates are shown as heatmaps.

grid.arrange(
  heatplot(df_temp_FinalwoBorder_Anno, 15),
  heatplot(df_temp_FinalwoBorder_Anno, 25),
  heatplot(df_temp_FinalwoBorder_Anno, 35),
  heatplot(df_temp_FinalwoBorder_Anno, 45),
  heatplot(df_temp_FinalwoBorder_Anno, 55),
  heatplot(df_temp_FinalwoBorder_Anno, 65),
  heatplot(df_temp_FinalwoBorder_Anno, 75),
  heatplot(df_temp_FinalwoBorder_Anno, 85),
  heatplot(df_temp_FinalwoBorder_Anno, 95),
  ncol = 3)

4.2 Data Correction and Normalization

Here we use SGA Tools (Omar Wagih et al., 2013) to correct the data for plate and spatial effects as follows:

  1. plateNormalization 1 (pnorm): Normalises colony sizes to the plate median per plate.

  2. spatialNormalization (snorm): Normalizes pnorm colony sizes on each plate using a gaussian filter to normalize for any gradient effect on the plate via median smoothing.

  3. rowcolNormalization (rcnorm): Normalizes snorm colony sizes on each plate by column then by row using lowess smoothing.

  4. plateNormalization 2 (pnorm2): Re-normalises rcnorm colony sizes to the plate median of all plates.

4.2.1 Data Normalization

The colony size is first normalized to the median per query type per plate by subtraction. Then, the SGA Tools normalizeSGA function is executed. The median of each query per plate is then added back to the normalized colony sizes.

The first median normalization is to avoid overcorrection with the SGA normalization procedure due to the difference in average colony size between queries and wild type.

buffer_value <- 1000

# First, we append the photoData tables with the annotation table.
photoData_FinalwoBorder_Anno <- left_join(photoData_FinalwoBorder,
                                          KOanno,
                                          by = c("plate", "well")) %>%
  # Calculate the median of each query by plate and attach it to the main table.
  do({
    tempA <- . 
    tempB <- tempA %>%
      group_by(plate, Query) %>%
      summarise(platequery_median = median(size, na.rm = T))
    tempC <- left_join(tempA, tempB, by = c("plate", "Query"))
    tempC
    }) %>%
  # Calculate the median adjusted size and added 1000 to avoid the negative
  # numbers which causes problems to SGA normalization.
  # There are no problems with spots that were already marked with NA.
  mutate(median_adj_size = size - platequery_median + buffer_value) %>%
  ungroup()
# Plates description
nPlates <- 85
nROW <- 32
nCOL <- 48

# Overall experiment plate median is set to be able to compare between different
# conditions/plates.
# This value is now 1000 due to the buffer value added to all colonies above.
KOPM <- median(photoData_FinalwoBorder_Anno$median_adj_size, na.rm = T)

The normalization is done for 85 plates, with 32 rows and 48 columns in a 1536 format.

# Normalization of the data based on median adjusted size
DataInput <- photoData_FinalwoBorder_Anno

# Temporary table to hold the normalized data, initialised with just the
# column structure of the input (zero rows)
retA <- data.frame(DataInput[0, ])

# Run SGA normalization plate by plate. Plates in this dataset are numbered
# 13 through 12 + nPlates (= 97).
# NOTE(review): growing retA with rbind() inside the loop is quadratic; for
# larger datasets collect results in a list and bind once at the end.
for(i in 13:(12+nPlates)){
  retB <- DataInput %>% filter(plate == i)
  
  retC <- normalizeSGA(plate.data = retB, overall.plate.median = KOPM,
                       field.to.normalize = "median_adj_size")
  
  retA <- rbind(retA, retC)
}

# Return the normalized data from the temporary table to a permanent table.
photoData_Final_Normalized <- retA %>%
  # Add back the removed median
  mutate(pnorm3 = pnorm2 + platequery_median - buffer_value) %>%
  # Set anything that drops below zero as NA
  # These are colonies that are very small initially (<4SD below median)
  do({
    tempA <- .
    tempA$pnorm3[tempA$pnorm3 < 0] <- NA
    tempA
  }) %>%
  group_by(plate) %>%
  # Adjust the values according to the median of the plate
  mutate(platemedian1 = median(pnorm3, na.rm = T),
         pnorm4 = pnorm3/platemedian1) %>%
  ungroup() %>%
  do({
    tempA <- .
    # Change the NAs to zero for the jackknife function.
    tempA$pnorm4[is.na(tempA$pnorm4)] <- 0
    tempB <- tempA %>%
      # Performing jackknife marking here. It checks for colonies that contribute
      # more than 90% of total variance in their replicates and marks as TRUE
      group_by(ORF, Query) %>%
      mutate(JK = jackknife(pnorm4, theta)$jack.values,
             total.var = var(pnorm4, na.rm=T) * (length(pnorm4)-1),
             jk.var =  (JK^2) * (length(JK)-2),
             JK.TRUE = (total.var - jk.var) > (0.9*total.var)) %>%
      ungroup()
    # Change the zeros back to NA
    tempB$pnorm4[tempB$pnorm4 == 0] <- NA
    tempB
  }) %>%
  ungroup()

4.2.2 Illustrating Normalization Outcome

Colony sizes are shown here by plate with boxplots to illustrate how the normalization affects the data for each step.

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), size)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Raw colony size") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), median_adj_size)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Colony size (Median subtracted by query per plate)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), pnorm)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Colony size (SGA Tools - pnorm)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), snorm)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Colony size (SGA Tools - snorm)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), rcnorm)) +
  labs(x = "Plate", y = "Colony Size",
       title = "Colony size (SGA Tools - rcnorm)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), pnorm2)) +
  labs(x = "Plate", y = "Colony Size",
       title = "Colony size (SGA Tools - pnorm2)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), pnorm3)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Colony size (Median re-added by query per plate)") +
  theme(axis.text.x = element_text(angle = -45))

ggplot(data = photoData_Final_Normalized) + 
  geom_boxplot(aes(as.factor(plate), pnorm4)) +
  labs(x = "Plate", y = "Colony Size", 
       title = "Colony size (Normalized by plate median)") +
  theme(axis.text.x = element_text(angle = -45))

Some plates are plotted as heatmaps to illustrate the changes made by SGA Tools normalization.

The heatmaps below show the plates that were median subtracted by query, post-SGA Tools normalization, and the difference made by SGA Tools normalization.

# Filter off positions that do not have an ORF in the KO library.
df_temp_Final_Normalized <- photoData_Final_Normalized %>%
  filter(!is.na(ORF))

# Heatmaps
p_number <- 25
grid.arrange(
  heatplot(df_temp_Final_Normalized, p_number, "median_adj_size",
           "(Median \nsubtracted by \nquery per \nplate)",
           lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  heatplot(df_temp_Final_Normalized, p_number, "pnorm2",
           "pnorm2", lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  ggplot() +
    geom_tile(data = df_temp_Final_Normalized %>%
                filter(plate == p_number) %>%
                mutate(difference = pnorm2 - median_adj_size),
              aes(as.factor(col), reorder(as.factor(row), -row), fill = difference)) +
    scale_fill_gradientn(name = "Difference", na.value = "white",
                         colours = c("lightblue1", "cyan", "black", "yellow3",
                                     "yellow")
                         ) +
    coord_fixed(0.8) +
    theme_light(base_size = 26) + theme(panel.grid.major = element_blank(),
                          panel.grid.minor = element_blank()) +
    scale_y_discrete(limits = as.character(rev(seq(5, 28, 1))),
                     breaks = as.character(seq(5, 28, 1))) +
    scale_x_discrete(limits = as.character((seq(5, 44, 1))),
                     breaks = as.character(seq(5, 44, 1))) +
    labs(x = "Column", y = "Row",
         subtitle = paste("Plate", p_number))
  ,
  ncol = 1)

p_number <- 35
grid.arrange(
  heatplot(df_temp_Final_Normalized, p_number, "median_adj_size",
           "(Median \nsubtracted by \nquery per \nplate)",
           lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  heatplot(df_temp_Final_Normalized, p_number, "pnorm2",
           "pnorm2", lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  ggplot() +
    geom_tile(data = df_temp_Final_Normalized %>%
                filter(plate == p_number) %>%
                mutate(difference = pnorm2 - median_adj_size),
              aes(as.factor(col), reorder(as.factor(row), -row), fill = difference)) +
    scale_fill_gradientn(name = "Difference", na.value = "white",
                         colours = c("lightblue1", "cyan", "black", "yellow3",
                                     "yellow")
                         ) +
    coord_fixed(0.8) +
    theme_light(base_size = 26) + theme(panel.grid.major = element_blank(),
                          panel.grid.minor = element_blank()) +
    scale_y_discrete(limits = as.character(rev(seq(5, 28, 1))),
                     breaks = as.character(seq(5, 28, 1))) +
    scale_x_discrete(limits = as.character((seq(5, 44, 1))),
                     breaks = as.character(seq(5, 44, 1))) +
    labs(x = "Column", y = "Row",
         subtitle = paste("Plate", p_number))
  ,
  ncol = 1)

p_number <- 45
grid.arrange(
  heatplot(df_temp_Final_Normalized, p_number, "median_adj_size",
           "(Median \nsubtracted by \nquery per \nplate)",
           lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  heatplot(df_temp_Final_Normalized, p_number, "pnorm2",
           "pnorm2", lowerlimit = 300, midpoint = 900, upperlimit = 1500),
  ggplot() +
    geom_tile(data = df_temp_Final_Normalized %>%
                filter(plate == p_number) %>%
                mutate(difference = pnorm2 - median_adj_size),
              aes(as.factor(col), reorder(as.factor(row), -row), fill = difference)) +
    scale_fill_gradientn(name = "Difference", na.value = "white",
                         colours = c("lightblue1", "cyan", "black", "yellow3",
                                     "yellow")
                         ) +
    coord_fixed(0.8) +
    theme_light(base_size = 26) + theme(panel.grid.major = element_blank(),
                          panel.grid.minor = element_blank()) +
    scale_y_discrete(limits = as.character(rev(seq(5, 28, 1))),
                     breaks = as.character(seq(5, 28, 1))) +
    scale_x_discrete(limits = as.character((seq(5, 44, 1))),
                     breaks = as.character(seq(5, 44, 1))) +
    labs(x = "Column", y = "Row",
         subtitle = paste("Plate", p_number))
  ,
  ncol = 1)

Lastly, we look at each query strain against the wild type in a scatter plot after the normalization.

# Data frame preparation
df_temp_Final_spread_norm <- df_temp_Final_Normalized  %>%
  select(plate, well, size, circularity, ORF, Query, pnorm4) %>%
  group_by(ORF, Query) %>%
  summarise(size = median(pnorm4, na.rm = T)) %>%
  spread(key = Query, value = size)

# Scatter plot
grid.arrange(
    ggplot(df_temp_Final_spread_norm) +
      geom_point(aes(WT, G1_RNH2), alpha = 0.3),
    ggplot(df_temp_Final_spread_norm) +
      geom_point(aes(WT, G2M_RNH2), alpha = 0.3),
    ggplot(df_temp_Final_spread_norm) +
      geom_point(aes(WT, S_RNH2), alpha = 0.3),
    ncol = 2,
    nrow = 2,
    top = textGrob(paste("Colony size (Normalized)"),
                   gp = gpar(fontsize = 24))
  )

With the data normalized, the whole data frame is simplified for further analysis.

# Simplify the normalized data table
df_KO_norm <- photoData_Final_Normalized %>%
                          select(plate, well, ORF, Query, size, circularity,
                                 IsBorder, IsEmpty, JK.TRUE, pnorm4)

# Remove the empty border rows
df_KO_norm <- anti_join(df_KO_norm, BorderColonies, by = "well")

5 Data Analysis

In this section, the difference between a query and wild type in each mutant is calculated and T-tests performed to determine the significance of the effect. The colonies that contribute to more than 90% of total variance in their replicates are not used for any calculations.

The p-values are corrected for multiple testing using the Benjamini-Hochberg method. The results are shown as volcano plots with the significant hits listed in the tables below each plot.

# Prepare the data table to make calculations easier
df_KO_norm <- df_KO_norm %>%
  mutate(wt = Query == "WT",
         G1 = Query == "G1_RNH2",
         S = Query == "S_RNH2",
         G2M = Query == "G2M_RNH2")

# Dividing the colony sizes by the median of all colonies for that query
df_KO_norm_2 <- df_KO_norm %>%
  group_by(Query) %>%
  mutate(pnorm4_adj = pnorm4 / median(pnorm4, na.rm = T)) %>%
  ungroup()

# Changing NA positions to zero for T-test calculations.
df_KO_norm_3 <- df_KO_norm_2
df_KO_norm_3$pnorm4_adj[is.na(df_KO_norm_3$pnorm4_adj)] <- 0

# Calculating the values for WT strains
# The mean of the WT strains are calculated after applying the Jackknife filter.
wt_KO <- filter(df_KO_norm_3, wt, !JK.TRUE) %>%
  group_by(ORF) %>%
  dplyr::summarise_if(is.numeric, mean, na.rm = T) %>%
  ungroup()

# Pre-table for plotting.
# Comparing the query against WT strains.
df_KO_inCondition <- filter(df_KO_norm_3, !wt) %>%
  inner_join(., wt_KO,
             by = c("ORF"), suffix = c("", "_WTMean")) %>%
  select(-plate_WTMean) %>%
  mutate(delta_size_pnorm4 = pnorm4_adj / pnorm4_adj_WTMean)

Before proceeding to T-test calculations, several thresholds are set here.

# Change these value to the desired threshold. Currently the settings are:

# 20% decrease = 0.8
sizeThresLow <- log2(0.8)

# BH corrected p-value
BHThres <- 0.05

5.1 G1-RNH202 query

5.1.1 T-tests

# Setting up dataframe: keep only WT and G1 rows (drop S and G2M) so the
# two-group t-test compares the G1_RNH2 query against WT.
forttest_in_G1_RNH2 <- df_KO_norm_3 %>%
  filter(!S, !G2M) %>%
  drop_na(ORF)

# Extracting list of all ORFs present in the data frame
uniqlist <- unique(forttest_in_G1_RNH2$ORF)

# Building the table to store the t-test p-values. Pre-filled with NA so
# ORFs whose test errors out remain NA instead of breaking the loop.
ttestdat <- data.frame(ORF = uniqlist)
ttestdat$pvalue_delta_size_pnorm4 <- NA

# Per-ORF t-test of adjusted colony size, query vs WT. Jackknife-flagged
# outliers and non-finite sizes are excluded. Errors (e.g. too few
# replicates for an ORF) are reported and that ORF is skipped.
for (i in seq_along(uniqlist)) {
  tryCatch({
    ttestdat[i, "pvalue_delta_size_pnorm4"] <- t.test(pnorm4_adj ~ Query,
                          data = (forttest_in_G1_RNH2 %>%
                                  filter(ORF == (uniqlist[i]),
                                         !JK.TRUE,
                                         is.finite(pnorm4_adj))))$p.value
  }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
}

# Benjamini-Hochberg correction across all ORF-level p-values
ttestdat$BH_delta_size_pnorm4 <- p.adjust(ttestdat$pvalue_delta_size_pnorm4,
                                          method = "BH")

# Join the per-ORF p-values to the per-ORF mean fold changes for plotting
df_KO_final_in_G1_RNH2 <- ttestdat %>%
  inner_join(., df_KO_inCondition %>%
               filter(G1, !JK.TRUE) %>%
               group_by(ORF) %>%
               summarise_if(is.numeric, mean, na.rm = TRUE),
             by = c("ORF")) %>%
  mutate(log2delta_size_pnorm4 = log2(delta_size_pnorm4))

5.1.2 Volcano plot

After performing the calculations, the volcano plots are shown below. The thresholds applied are: BH corrected p-value < 0.05 with Log2FC(Colony Size) <= Log2(0.8) for significant decrease.

# Significant hits are computed once here and reused three times: for the
# red highlight layer, the percentage annotation, and the hits table.
sig_G1 <- df_KO_final_in_G1_RNH2 %>%
  filter(log2delta_size_pnorm4 <= sizeThresLow,
         BH_delta_size_pnorm4 < BHThres)

ggplot() +
  # All ORFs as the background layer
  geom_point(data = df_KO_final_in_G1_RNH2,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2) +
  # Significant decreases highlighted in red
  geom_point(data = sig_G1,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2, col = "red") +
  scale_x_continuous(breaks = seq(-20, 20, by = 0.5)) +
  scale_y_continuous(breaks = seq(-20, 20, by = 0.5)) +
  # Percentage of ORFs passing both thresholds
  annotate("text", x = -2, y = 1,
           label = paste0("Significant decrease: \n",
              (100*(round(nrow(sig_G1)/nrow(df_KO_final_in_G1_RNH2),
                          digits = 3))), "%"),
           col = "red") +
  labs(x = "Log2FC(Colony Size)", y = "-Log10(BH-adjusted p-value)",
       title = "Volcano plot with significant hits highlighted.",
       subtitle = paste0("Condition: G1_RNH202 query; Normalised colony size",
                         "\nThresholds applied:",
                         "\nBH-adjusted p-value < ", BHThres,
                         "\nLog2FC(Colony Size) <= Log2(",
                         round(2^sizeThresLow, digits = 3),
                         ") for significant decrease")
       )

# Table of significant hits, strongest decrease first
Hits_G1 <- as.data.frame(
    sig_G1 %>%
      select(ORF, log2delta_size_pnorm4, BH_delta_size_pnorm4) %>%
      arrange(log2delta_size_pnorm4) %>%
      `colnames<-`(., c("ORF", "Log2_FC", "BH Adj. pvalue")))
ORFs with significant decrease in colony sizes (using BH adjusted p-value threshold)
ORF Log2_FC BH Adj. pvalue
YDR281C -Inf 0.003
YDR279W -3.914 0.002
YDR278C -2.070 0.012
YDR034C -1.402 0.045
YPR133W-A -1.261 0.041
YDR283C -0.777 0.012
YNL250W -0.664 0.039
YOL033W -0.637 0.049
YGR240C -0.571 0.009
YDR285W -0.563 0.028
YDR296W -0.542 0.045
YDR287W -0.539 0.047
YPR024W -0.536 0.012
YKL155C -0.486 0.037
YCR071C -0.447 0.028
YMR098C -0.421 0.027
YLR382C -0.394 0.028
YLR234W -0.351 0.034
YNL080C -0.339 0.037
YDR264C -0.334 0.042
YDR244W -0.334 0.034
YML090W -0.324 0.029

5.2 S-RNH202 query

5.2.1 T-tests

# Setting up dataframe: keep only WT and S rows (drop G1 and G2M) so the
# two-group t-test compares the S_RNH2 query against WT.
forttest_in_S_RNH2 <- df_KO_norm_3 %>%
  filter(!G1, !G2M) %>%
  drop_na(ORF)

# Extracting list of all ORFs present in the data frame
uniqlist <- unique(forttest_in_S_RNH2$ORF)

# Building the table to store the t-test p-values. Pre-filled with NA so
# ORFs whose test errors out remain NA instead of breaking the loop.
ttestdat <- data.frame(ORF = uniqlist)
ttestdat$pvalue_delta_size_pnorm4 <- NA

# Per-ORF t-test of adjusted colony size, query vs WT. Jackknife-flagged
# outliers and non-finite sizes are excluded; test errors are reported
# and that ORF is skipped.
for (i in seq_along(uniqlist)) {
  tryCatch({
    ttestdat[i, "pvalue_delta_size_pnorm4"] <- t.test(pnorm4_adj ~ Query,
                          data = (forttest_in_S_RNH2 %>%
                                  filter(ORF == (uniqlist[i]),
                                         !JK.TRUE,
                                         is.finite(pnorm4_adj))))$p.value
  }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
}

# Benjamini-Hochberg correction across all ORF-level p-values
ttestdat$BH_delta_size_pnorm4 <- p.adjust(ttestdat$pvalue_delta_size_pnorm4,
                                          method = "BH")

# Join the per-ORF p-values to the per-ORF mean fold changes for plotting
df_KO_final_in_S_RNH2 <- ttestdat %>%
  inner_join(., df_KO_inCondition %>%
               filter(S, !JK.TRUE) %>%
               group_by(ORF) %>%
               summarise_if(is.numeric, mean, na.rm = TRUE),
             by = c("ORF")) %>%
  mutate(log2delta_size_pnorm4 = log2(delta_size_pnorm4))

5.2.2 Volcano plot

After performing the calculations, the volcano plots are shown below. The thresholds applied are: BH corrected p-value < 0.05 with Log2FC(Colony Size) <= Log2(0.8) for significant decrease.

# Significant hits are computed once here and reused three times: for the
# red highlight layer, the percentage annotation, and the hits table.
sig_S <- df_KO_final_in_S_RNH2 %>%
  filter(log2delta_size_pnorm4 <= sizeThresLow,
         BH_delta_size_pnorm4 < BHThres)

ggplot() +
  # All ORFs as the background layer
  geom_point(data = df_KO_final_in_S_RNH2,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2) +
  # Significant decreases highlighted in red
  geom_point(data = sig_S,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2, col = "red") +
  scale_x_continuous(breaks = seq(-20, 20, by = 0.5)) +
  scale_y_continuous(breaks = seq(-20, 20, by = 0.5)) +
  # Percentage of ORFs passing both thresholds
  annotate("text", x = -2, y = 1,
           label = paste0("Significant decrease: \n",
              (100*(round(nrow(sig_S)/nrow(df_KO_final_in_S_RNH2),
                          digits = 3))), "%"),
           col = "red") +
  labs(x = "Log2FC(Colony Size)", y = "-Log10(BH-adjusted p-value)",
       title = "Volcano plot with significant hits highlighted.",
       subtitle = paste0("Condition: S_RNH202 query; Normalised colony size",
                         "\nThresholds applied:",
                         "\nBH-adjusted p-value < ", BHThres,
                         "\nLog2FC(Colony Size) <= Log2(",
                         round(2^sizeThresLow, digits = 3),
                         ") for significant decrease")
       )

# Table of significant hits, strongest decrease first
Hits_S <- as.data.frame(
    sig_S %>%
      select(ORF, log2delta_size_pnorm4, BH_delta_size_pnorm4) %>%
      arrange(log2delta_size_pnorm4) %>%
      `colnames<-`(., c("ORF", "Log2_FC", "BH Adj. pvalue")))
ORFs with significant decrease in colony sizes (using BH adjusted p-value threshold)
ORF Log2_FC BH Adj. pvalue
YDR276C -8.528 0.018
YNL250W -5.963 0.012
YNL265C -5.375 0.007
YAL035W -4.277 0.028
YGL033W -4.058 0.017
YMR224C -3.887 0.005
YDR277C -3.765 0.000
YDR250C -3.644 0.017
YML032C -2.581 0.034
YDR369C -2.521 0.005
YDR278C -2.082 0.017
YDR279W -1.619 0.002
YBR094W -1.228 0.010
YLR234W -1.174 0.004
YBR099C -1.039 0.000
YBR098W -0.858 0.006
YGL163C -0.855 0.007
YDR386W -0.815 0.000
YJL115W -0.754 0.040
YDR004W -0.745 0.009
YPL024W -0.736 0.003
YLL002W -0.723 0.022
YDR283C -0.707 0.019
YJR018W -0.702 0.017
YDR076W -0.690 0.011
YOL033W -0.686 0.012
YPR164W -0.632 0.032
YER087W -0.615 0.019
YLR382C -0.591 0.017
YNL284C -0.560 0.012
YKL155C -0.531 0.021
YER070W -0.524 0.036
YGR240C -0.521 0.007
YJL047C -0.480 0.012
YMR267W -0.460 0.034
YCR071C -0.453 0.017
YHR154W -0.447 0.014
YDR285W -0.439 0.030
YDR287W -0.435 0.002
YDR291W -0.413 0.040
YMR190C -0.347 0.021
YLR069C -0.340 0.010
YNL080C -0.337 0.041
YOR150W -0.330 0.043
YMR098C -0.329 0.040
YDR259C -0.325 0.031

5.3 G2M-RNH202 query

5.3.1 T-tests

# Setting up dataframe: keep only WT and G2M rows (drop G1 and S) so the
# two-group t-test compares the G2M_RNH2 query against WT.
forttest_in_G2M_RNH2 <- df_KO_norm_3 %>%
  filter(!G1, !S) %>%
  drop_na(ORF)

# Extracting list of all ORFs present in the data frame
uniqlist <- unique(forttest_in_G2M_RNH2$ORF)

# Building the table to store the t-test p-values. Pre-filled with NA so
# ORFs whose test errors out remain NA instead of breaking the loop.
ttestdat <- data.frame(ORF = uniqlist)
ttestdat$pvalue_delta_size_pnorm4 <- NA

# Per-ORF t-test of adjusted colony size, query vs WT. Jackknife-flagged
# outliers and non-finite sizes are excluded; test errors are reported
# and that ORF is skipped.
for (i in seq_along(uniqlist)) {
  tryCatch({
    ttestdat[i, "pvalue_delta_size_pnorm4"] <- t.test(pnorm4_adj ~ Query,
                          data = (forttest_in_G2M_RNH2 %>%
                                  filter(ORF == (uniqlist[i]),
                                         !JK.TRUE,
                                         is.finite(pnorm4_adj))))$p.value
  }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
}

# Benjamini-Hochberg correction across all ORF-level p-values
ttestdat$BH_delta_size_pnorm4 <- p.adjust(ttestdat$pvalue_delta_size_pnorm4,
                                          method = "BH")

# Join the per-ORF p-values to the per-ORF mean fold changes for plotting
df_KO_final_in_G2M_RNH2 <- ttestdat %>%
  inner_join(., df_KO_inCondition %>%
               filter(G2M, !JK.TRUE) %>%
               group_by(ORF) %>%
               summarise_if(is.numeric, mean, na.rm = TRUE),
             by = c("ORF")) %>%
  mutate(log2delta_size_pnorm4 = log2(delta_size_pnorm4))

5.3.2 Volcano plot

After performing the calculations, the volcano plots are shown below. The thresholds applied are: BH corrected p-value < 0.05 with Log2FC(Colony Size) <= Log2(0.8) for significant decrease.

# Significant hits are computed once here and reused three times: for the
# red highlight layer, the percentage annotation, and the hits table.
sig_G2M <- df_KO_final_in_G2M_RNH2 %>%
  filter(log2delta_size_pnorm4 <= sizeThresLow,
         BH_delta_size_pnorm4 < BHThres)

ggplot() +
  # All ORFs as the background layer
  geom_point(data = df_KO_final_in_G2M_RNH2,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2) +
  # Significant decreases highlighted in red
  geom_point(data = sig_G2M,
             aes(log2delta_size_pnorm4, -log10(BH_delta_size_pnorm4)),
             alpha = 0.2, col = "red") +
  scale_x_continuous(breaks = seq(-20, 20, by = 0.5)) +
  scale_y_continuous(breaks = seq(-20, 20, by = 0.5)) +
  # Percentage of ORFs passing both thresholds
  annotate("text", x = -2, y = 1,
           label = paste0("Significant decrease: \n",
              (100*(round(nrow(sig_G2M)/nrow(df_KO_final_in_G2M_RNH2),
                          digits = 3))), "%"),
           col = "red") +
  labs(x = "Log2FC(Colony Size)", y = "-Log10(BH-adjusted p-value)",
       title = "Volcano plot with significant hits highlighted.",
       subtitle = paste0("Condition: G2M_RNH202 query; Normalised colony size",
                         "\nThresholds applied:",
                         "\nBH-adjusted p-value < ", BHThres,
                         "\nLog2FC(Colony Size) <= Log2(",
                         round(2^sizeThresLow, digits = 3),
                         ") for significant decrease")
       )

# Table of significant hits, strongest decrease first
Hits_G2M <- as.data.frame(
    sig_G2M %>%
      filter(log2delta_size_pnorm4 <= sizeThresLow,
             BH_delta_size_pnorm4 < BHThres) %>%
      select(ORF, log2delta_size_pnorm4, BH_delta_size_pnorm4) %>%
      arrange(log2delta_size_pnorm4) %>%
      `colnames<-`(., c("ORF", "Log2_FC", "BH Adj. pvalue")))
ORFs with significant decrease in colony sizes (using BH adjusted p-value threshold)
ORF Log2_FC BH Adj. pvalue
YIL097W -Inf 0.021
YDR276C -Inf 0.032
YDR283C -0.643 0.024
YOL033W -0.620 0.027
YCR071C -0.607 0.039
YKL155C -0.598 0.041
YDR285W -0.394 0.026
YLR236C -0.325 0.031

6 GO Term Enrichment Analysis

After obtaining the list of hits for all three cell cycle restricted alleles of RNH202, GO Term analysis is performed.

6.1 Data preparation and load

First, the necessary lists of ORFs are prepared as data tables for the analysis process. The data tables needed are:

  1. Background: all ORFs in the screen

  2. Hits: ORFs that were hits in the respective queries

Note: RNH202 was removed from all hits tables as it is the gene tagged in all queries.

# Listing of ORFs
# Background list: every ORF tested in the screen, minus placeholder rows.
Intermine_BG <- df_KO_final_in_G1_RNH2 %>%
  select(ORF) %>%
  filter(!grepl('UNKNOWN', ORF))

# RNH202 (systematic name YDR279W) is the gene tagged in every query, so
# it is removed from each hits list (see the note above).
rnh202_orf <- "YDR279W"

# Helper: keep real ORF hits only, dropping placeholder entries and the
# query gene itself.
KeepRealHits <- function(hits) {
  hits %>%
    filter(!grepl('UNKNOWN', ORF), ORF != rnh202_orf) %>%
    select(ORF)
}

Intermine_hits_G1 <- KeepRealHits(Hits_G1)

Intermine_hits_G2M <- KeepRealHits(Hits_G2M)

Intermine_hits_S <- KeepRealHits(Hits_S)

Using the background list of ORFs, the "Gene → GO Terms" template on YeastMine (https://yeastmine.yeastgenome.org/yeastmine/) was used to obtain the list of all related GO Terms. The list is loaded as below.

# Locate and read in the most recent GO Term data
# All GO Term data downloaded were saved in "YYYYMMDD_YM_Gene_GOTerm_BG"
# format, and thus the most recent file would be the last in the list
# (list.files() sorts names alphabetically, which matches chronological
# order for this date prefix).
GO_Data <- list.files(pattern = "YM_Gene_GOTerm_BG.csv")
Full_GO_Data <- read.csv(paste0(GO_Data[length(GO_Data)]))

# The .rda version is located the same way; load() restores the
# YM_Gene_GOTerm_BG object used below.
# NOTE(review): if no matching file exists, GO_Data[length(GO_Data)] is
# character(0) and load() fails with an unhelpful message — verify the
# working directory contains the data files before knitting.
GO_Data <- list.files(pattern = "YM_Gene_GOTerm_BG_4785")
load(paste0(GO_Data[length(GO_Data)]))

print(paste0("GO Terms data loaded: ", GO_Data[length(GO_Data)]))
## [1] "GO Terms data loaded: 20210222_YM_Gene_GOTerm_BG_4785.rda"
# The table is modified for the function to work: every column is coerced
# to character so identifier matching behaves consistently downstream.
YM_Gene_GOTerm_BG <- YM_Gene_GOTerm_BG %>%
  mutate_all(as.character)

The next step is to subset the complete GO Term data to the set of hits for each query.

# Restrict the full GO Term table to the hit ORFs of each query by
# matching ORF names against the "Gene.secondaryIdentifier" column
# (semi_join keeps rows of the left table with a match in the right).
YM_Gene_GOTerm_Input_G1 <- YM_Gene_GOTerm_BG %>%
  semi_join(Intermine_hits_G1,
            by = c("Gene.secondaryIdentifier" = "ORF"))

YM_Gene_GOTerm_Input_G2M <- YM_Gene_GOTerm_BG %>%
  semi_join(Intermine_hits_G2M,
            by = c("Gene.secondaryIdentifier" = "ORF"))

YM_Gene_GOTerm_Input_S <- YM_Gene_GOTerm_BG %>%
  semi_join(Intermine_hits_S,
            by = c("Gene.secondaryIdentifier" = "ORF"))

6.2 Enrichment analysis

With all the necessary data tables prepared, the GO Term enrichment analysis is performed using a hypergeometric distribution as described in InterMine’s documentation (https://intermine.readthedocs.io/en/latest/embedding/list-widgets/enrichment-widgets/). The p-values obtained are then corrected for multiple testing using the Benjamini-Hochberg method.

Note: The hypergeometric test takes a long time to run due to the large number of ORFs. It is therefore easier to run it once whenever the downloaded data is refreshed, save the result as an R object, and reload it when needed. The code below is an example of the calculations performed.

# Performing hypergeometric test to obtain p-value for enrichment
# Note: The hypergeometric test process done takes a long time due to the large
# number of ORFs. Thus, it is easier to do this once whenever the downloaded
# data is refreshed, save it as an R object, and then reload the data when
# needed (see the commented save() calls below).
GOEnrichment_Hits_G1 <- EnrichGOTERMS(
  background = YM_Gene_GOTerm_BG, input = YM_Gene_GOTerm_Input_G1,
  BGlist = Intermine_BG, Inputlist = Intermine_hits_G1)

GOEnrichment_Hits_G2M <- EnrichGOTERMS(
  background = YM_Gene_GOTerm_BG, input = YM_Gene_GOTerm_Input_G2M,
  BGlist = Intermine_BG, Inputlist = Intermine_hits_G2M)

GOEnrichment_Hits_S <- EnrichGOTERMS(
  background = YM_Gene_GOTerm_BG, input = YM_Gene_GOTerm_Input_S,
  BGlist = Intermine_BG, Inputlist = Intermine_hits_S)

# Multiple testing correction using Benjamini-Hochberg correction
# Note: The correction for multiple testing is done per category (Biological
# process, Cellular component, and Molecular function). Information on how the
# grouping was done for the multiple testing correction could not be found but
# this is most likely the method used on YeastMine's website as each category
# is on a different tab.
# The tables are ungrouped afterwards so downstream filter/arrange/summary
# operations are not silently applied per-namespace.
GOEnrichment_Hits_G1 <- GOEnrichment_Hits_G1 %>%
  group_by(Gene.goAnnotation.ontologyTerm.parents.namespace) %>%
  mutate(BH = p.adjust(pval, method = "BH")) %>%
  ungroup()

GOEnrichment_Hits_G2M <- GOEnrichment_Hits_G2M %>%
  group_by(Gene.goAnnotation.ontologyTerm.parents.namespace) %>%
  mutate(BH = p.adjust(pval, method = "BH")) %>%
  ungroup()

GOEnrichment_Hits_S <- GOEnrichment_Hits_S %>%
  group_by(Gene.goAnnotation.ontologyTerm.parents.namespace) %>%
  mutate(BH = p.adjust(pval, method = "BH")) %>%
  ungroup()

# The tables are saved as R objects for easy and quick load.
#save(GOEnrichment_Hits_G1,
#     file = paste0((format(Sys.time(), "%Y%m%d")),
#                   "_GO_Enrichments_Hits_G1.rda"))
#save(GOEnrichment_Hits_G2M,
#     file = paste0((format(Sys.time(), "%Y%m%d")),
#                   "_GO_Enrichments_Hits_G2M.rda"))
#save(GOEnrichment_Hits_S,
#     file = paste0((format(Sys.time(), "%Y%m%d")),
#                   "_GO_Enrichments_Hits_S.rda"))

6.3 GO Term enrichment plots

The most recent GO Term enrichment data that were calculated are loaded for further analysis.

# Locate and load in the most recent GO Term enrichment data
# All GO Term enrichment data were saved in "YYYYMMDD_GO_Enrichments_Hits_Query"
# format, and thus the most recent file would be the last in the
# (alphabetically sorted) list returned by list.files().
GO_Data_Hits_G1 <- list.files(pattern = "_GO_Enrichments_Hits_G1")
GO_Data_Hits_G2M <- list.files(pattern = "_GO_Enrichments_Hits_G2M")
GO_Data_Hits_S <- list.files(pattern = "_GO_Enrichments_Hits_S")
# load() takes the file name directly; the redundant paste0() wrappers
# around a single string were removed (no behavioural change).
load(GO_Data_Hits_G1[length(GO_Data_Hits_G1)])
load(GO_Data_Hits_G2M[length(GO_Data_Hits_G2M)])
load(GO_Data_Hits_S[length(GO_Data_Hits_S)])

# Setting the ggplot theme for this section
theme_set(theme_light(base_size = 30))

# Threshold for the corrected p-value for GO Term enrichment data
BHThres_GO <- 0.1

The above enrichment calculations are visualized in the plots below by query and by category. Only the top 10 GO terms for each category are shown with a threshold set at BH-adjusted p-value < 0.1

6.3.1 G1-RNH202 query

# Some variables for plotting
# Shared y-axis ceiling for the three category plots below, set by the
# strongest enrichment (smallest BH-adjusted p-value) in this query.
max_value <- ceiling(max(-log10(GOEnrichment_Hits_G1$BH)))

#G1 GO TERM Plots
# Each plot: top 10 enriched GO terms (BH < BHThres_GO) for one namespace,
# drawn as horizontal bars of -log10(BH) ordered by significance; the
# number printed beside each bar is the hit count for that term.
# NOTE(review): expand_scale() is deprecated in ggplot2 >= 3.3 in favour
# of expansion(); kept here for the ggplot2 3.1.1 used in this session.
ggplot(data = GOEnrichment_Hits_G1 %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "molecular_function",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G1-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Molecular Function ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G1)))

# Biological Process — same layout as the Molecular Function plot above.
ggplot(data = GOEnrichment_Hits_G1 %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "biological_process",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G1-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Biological Process ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G1)))

# Cellular Component — same layout as the plots above.
ggplot(data = GOEnrichment_Hits_G1 %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "cellular_component",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G1-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Cellular Component ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G1)))

6.3.2 S-RNH202 query

# Some variables for plotting
# Shared y-axis ceiling for the three category plots below, set by the
# strongest enrichment (smallest BH-adjusted p-value) in this query.
max_value <- ceiling(max(-log10(GOEnrichment_Hits_S$BH)))

#S GO TERM Plots
# Each plot: top 10 enriched GO terms (BH < BHThres_GO) for one namespace,
# drawn as horizontal bars of -log10(BH) ordered by significance; the
# number printed beside each bar is the hit count for that term.
# NOTE(review): the hit-count label offset here is 0.5, versus 0.1 in the
# G1 and G2M sections — presumably tuned for this query's larger axis
# range; confirm this difference is intentional.
ggplot(data = GOEnrichment_Hits_S %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "molecular_function",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.5),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "S-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Molecular Function ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_S)))

# Biological Process — same layout as the Molecular Function plot above.
ggplot(data = GOEnrichment_Hits_S %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "biological_process",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.5),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "S-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Biological Process ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_S)))

# Cellular Component — same layout as the plots above.
ggplot(data = GOEnrichment_Hits_S %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "cellular_component",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.5),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "S-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Cellular Component ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_S)))

6.3.3 G2M-RNH202 query

# Some variables for plotting
# Shared y-axis ceiling for the three category plots below, set by the
# strongest enrichment (smallest BH-adjusted p-value) in this query.
max_value <- ceiling(max(-log10(GOEnrichment_Hits_G2M$BH)))

#G2M GO TERM Plots
# Each plot: top 10 enriched GO terms (BH < BHThres_GO) for one namespace,
# drawn as horizontal bars of -log10(BH) ordered by significance; the
# number printed beside each bar is the hit count for that term.
ggplot(data = GOEnrichment_Hits_G2M %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "molecular_function",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G2M-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Molecular Function ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G2M)))

# Biological Process — same layout as the Molecular Function plot above.
ggplot(data = GOEnrichment_Hits_G2M %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "biological_process",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G2M-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Biological Process ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G2M)))

# Cellular Component — same layout as the plots above.
ggplot(data = GOEnrichment_Hits_G2M %>%
     filter(Gene.goAnnotation.ontologyTerm.parents.namespace == "cellular_component",
            BH < BHThres_GO) %>%
     arrange(BH) %>% head(10),
     aes(reorder(Gene.goAnnotation.ontologyTerm.parents.name, -log10(BH)),
         -log10(BH))) +
  geom_col() +
  geom_text(aes(y = (-log10(BH) + 0.1),
                label = paste(Hits)), size = 8) +
  scale_y_continuous(limits = c(0, max_value),
                     breaks = seq(0, max_value, 1),
                     expand = expand_scale(mult = 0, add = c(0,0.5))) +
  coord_flip() +
  labs(x = "GO Terms",
       y = "-Log10(Adj. p-value)",
       title = "G2M-RNH202 query",
       subtitle = paste0("Top 10 GO Terms for Cellular Component ",
                         "(p < ", BHThres_GO, ")",
                         "\nNo. of Hits: ", nrow(Intermine_hits_G2M)))

7 R Studio Session Info

# Record the exact R version and package versions used for this analysis.
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] bootstrap_2019.6 logging_0.9-107  gridExtra_2.3    openxlsx_4.1.0  
##  [5] gdata_2.18.0     kableExtra_1.0.1 forcats_0.4.0    stringr_1.4.0   
##  [9] dplyr_0.8.0.1    purrr_0.3.1      readr_1.3.1      tidyr_0.8.3     
## [13] tibble_2.1.3     ggplot2_3.1.1    tidyverse_1.2.1 
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.8.1      tidyselect_0.2.5  xfun_0.5          haven_2.1.0      
##  [5] lattice_0.20-38   colorspace_1.4-1  generics_0.0.2    htmltools_0.3.6  
##  [9] viridisLite_0.3.0 yaml_2.2.0        rlang_0.3.4       pillar_1.3.1     
## [13] glue_1.3.1        withr_2.1.2       modelr_0.1.4      readxl_1.3.0     
## [17] jpeg_0.1-8        plyr_1.8.4        munsell_0.5.0     gtable_0.2.0     
## [21] cellranger_1.1.0  zip_2.0.0         rvest_0.3.2       evaluate_0.13    
## [25] labeling_0.3      knitr_1.28        highr_0.7         broom_0.5.1      
## [29] Rcpp_1.0.0        scales_1.0.0      backports_1.1.3   webshot_0.5.1    
## [33] jsonlite_1.6      png_0.1-7         hms_0.4.2         digest_0.6.19    
## [37] stringi_1.3.1     cli_1.0.1         tools_3.5.0       magrittr_1.5     
## [41] lazyeval_0.2.1    crayon_1.3.4      pkgconfig_2.0.2   xml2_1.2.0       
## [45] lubridate_1.7.4   assertthat_0.2.0  rmarkdown_2.2     httr_1.4.0       
## [49] rstudioapi_0.9.0  R6_2.4.0          nlme_3.1-137      compiler_3.5.0